├── .github └── workflows │ ├── install.yml │ ├── pretrain.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cs-cs.var.tok.txt ├── cs-cs.var.tok.txt.codebert.vocab ├── docs └── _static │ └── images │ ├── squareslab.png │ ├── strudel.png │ ├── training.jpeg │ └── training.png ├── setup.py ├── tests └── reproduce │ └── test_idbench.py └── varclr ├── __init__.py ├── benchmarks ├── __init__.py ├── benchmark.py └── idbench │ ├── large_pair_wise.csv │ ├── medium_pair_wise.csv │ └── small_pair_wise.csv ├── data ├── __init__.py ├── dataset.py ├── preprocessor.py └── vocab.py ├── models ├── __init__.py ├── encoders.py ├── loss.py ├── model.py ├── tokenizers.py └── urls_pretrained_model.py ├── pretrain.py └── utils ├── find_nn.py ├── gen_typos.py ├── infer.py ├── infer_avg.py ├── infer_ft_cbow.py ├── options.py └── similarity_search.py /.github/workflows/install.yml: -------------------------------------------------------------------------------- 1 | name: Install 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | install: 13 | # The type of runner that the job will run on 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-20.04] 19 | python-version: [3.6, 3.9] 20 | 21 | steps: 22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 23 | - uses: actions/checkout@v2 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Upgrade pip 29 | run: python -m pip install --upgrade pip setuptools wheel 30 | - name: Install dependencies 31 | run: pip install -e . 
32 | -------------------------------------------------------------------------------- /.github/workflows/pretrain.yml: -------------------------------------------------------------------------------- 1 | name: Pretrain 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | pretrain: 13 | # The type of runner that the job will run on 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-20.04] 19 | python-version: [3.6, 3.9] 20 | 21 | steps: 22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 23 | - uses: actions/checkout@v2 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Upgrade pip 29 | run: python -m pip install --upgrade pip setuptools wheel 30 | - name: Install dependencies 31 | run: pip install -e . 32 | - name: Disable wandb 33 | run: wandb disabled 34 | - name: Try pretrain avg 35 | run: python -m varclr.pretrain --model avg --name varclr-avg --epochs 1 --limit-train-batches 0.1 --gpu 0 36 | - name: Try pretrain lstm 37 | run: python -m varclr.pretrain --model lstm --name varclr-lstm --epochs 1 --limit-train-batches 0.1 --gpu 0 38 | - name: Try pretrain bert 39 | run: python -m varclr.pretrain --model bert --name varclr-codebert --epochs 1 --limit-train-batches 0.01 --sp-model split --last-n-layer-output 4 --batch-size 64 --lr 1e-5 --gpu 0 40 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | test: 13 | # The type of runner that the job will run on 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: 
[ubuntu-20.04] 19 | python-version: [3.6, 3.9] 20 | 21 | steps: 22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 23 | - uses: actions/checkout@v2 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Upgrade pip 29 | run: python -m pip install --upgrade pip setuptools wheel 30 | - name: Install dependencies 31 | run: pip install -e . 32 | - name: Test reproduce paper results 33 | run: python -m pytest tests -v 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Pre-trained models 141 | saved/ 142 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: end-of-file-fixer 8 | - id: check-yaml 9 | - repo: https://github.com/pycqa/isort 10 | rev: 5.8.0 11 | hooks: 12 | - id: isort 13 | name: isort (python) 14 | - id: isort 15 | name: isort (cython) 16 | types: [cython] 17 | - id: isort 18 | name: isort (pyi) 19 | types: [pyi] 20 | - repo: https://github.com/psf/black 21 | rev: 21.10b0 22 | hooks: 23 | - id: black 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Qibin Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the 
rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |     4 | 5 |
6 | 7 | --- 8 | 9 |
10 | 11 | [![Unittest](https://img.shields.io/github/actions/workflow/status/squaresLab/VarCLR/test.yml?branch=master)](https://github.com/squaresLab/VarCLR/actions/workflows/test.yml) 12 | [![GitHub stars](https://img.shields.io/github/stars/squaresLab/VarCLR)](https://github.com/squaresLab/VarCLR/stargazers) 13 | [![GitHub license](https://img.shields.io/github/license/squaresLab/varclr)](https://github.com/squaresLab/VarCLR/blob/master/LICENSE) 14 | [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 15 | 16 |
17 | 18 | ## [VarCLR: Variable Representation Pre-training via Contrastive Learning](https://arxiv.org/abs/2112.02650) 19 | 20 | **New:** Paper accepted by ICSE 2022. Preprint at [arXiv](https://arxiv.org/abs/2112.02650)! 21 | 22 | This repository contains code and pre-trained models for VarCLR, a contrastive learning based approach for learning semantic representations of variable names that effectively captures variable similarity, with state-of-the-art results on [IdBench@ICSE2021](https://conf.researchr.org/details/icse-2021/icse-2021-papers/3/IdBench-Evaluating-Semantic-Representations-of-Identifier-Names-in-Source-Code). 23 | 24 | - [VarCLR: Variable Representation Pre-training via Contrastive Learning](#varclr-variable-representation-pre-training-via-contrastive-learning) 25 | - [Step 0: Install](#step-0-install) 26 | - [Step 1: Load a Pre-trained VarCLR Model](#step-1-load-a-pre-trained-varclr-model) 27 | - [Step 2: VarCLR Variable Embeddings](#step-2-varclr-variable-embeddings) 28 | - [Get embedding of one variable](#get-embedding-of-one-variable) 29 | - [Get embeddings of list of variables (supports batching)](#get-embeddings-of-list-of-variables-supports-batching) 30 | - [Step 2: Get VarCLR Similarity Scores](#step-2-get-varclr-similarity-scores) 31 | - [Get similarity scores of N variable pairs](#get-similarity-scores-of-n-variable-pairs) 32 | - [Get pairwise (N * M) similarity scores from two lists of variables](#get-pairwise-n--m-similarity-scores-from-two-lists-of-variables) 33 | - [Step 3: Reproduce IdBench Benchmark Results](#step-3-reproduce-idbench-benchmark-results) 34 | - [Load the IdBench benchmark](#load-the-idbench-benchmark) 35 | - [Compute VarCLR scores and evaluate](#compute-varclr-scores-and-evaluate) 36 | - [Let's compare with the original CodeBERT](#lets-compare-with-the-original-codebert) 37 | - [Pre-train your own VarCLR models](#pre-train-your-own-varclr-models) 38 | - [Results on IdBench benchmarks](#results-on-idbench-benchmarks) 
39 | - [Similarity](#similarity) 40 | - [Relatedness](#relatedness) 41 | - [Cite](#cite) 42 | 43 | ### Step 0: Install 44 | 45 | ```bash 46 | pip install -e . 47 | ``` 48 | 49 | ### Step 1: Load a Pre-trained VarCLR Model 50 | 51 | ```python 52 | from varclr.models.model import Encoder 53 | model = Encoder.from_pretrained("varclr-codebert") 54 | ``` 55 | 56 | ### Step 2: VarCLR Variable Embeddings 57 | 58 | #### Get embedding of one variable 59 | 60 | ```python 61 | emb = model.encode("squareslab") 62 | print(emb.shape) 63 | # torch.Size([1, 768]) 64 | ``` 65 | 66 | #### Get embeddings of list of variables (supports batching) 67 | 68 | ```python 69 | emb = model.encode(["squareslab", "strudel"]) 70 | print(emb.shape) 71 | # torch.Size([2, 768]) 72 | ``` 73 | 74 | ### Step 2: Get VarCLR Similarity Scores 75 | 76 | #### Get similarity scores of N variable pairs 77 | 78 | ```python 79 | print(model.score("squareslab", "strudel")) 80 | # [0.42812108993530273] 81 | print(model.score(["squareslab", "average", "max", "max"], ["strudel", "mean", "min", "maximum"])) 82 | # [0.42812108993530273, 0.8849745988845825, 0.8035818338394165, 0.889922022819519] 83 | ``` 84 | 85 | #### Get pairwise (N * M) similarity scores from two lists of variables 86 | 87 | ```python 88 | variable_list = ["squareslab", "strudel", "neulab"] 89 | print(model.cross_score("squareslab", variable_list)) 90 | # [[1.0000007152557373, 0.4281214475631714, 0.7207341194152832]] 91 | print(model.cross_score(variable_list, variable_list)) 92 | # [[1.0000007152557373, 0.4281214475631714, 0.7207341194152832], 93 | # [0.4281214475631714, 1.0000004768371582, 0.549992561340332], 94 | # [0.7207341194152832, 0.549992561340332, 1.000000238418579]] 95 | ``` 96 | 97 | ### Step 3: Reproduce IdBench Benchmark Results 98 | 99 | #### Load the IdBench benchmark 100 | 101 | ```python 102 | from varclr.benchmarks import Benchmark 103 | 104 | # Similarity on IdBench-Medium 105 | b1 = Benchmark.build("idbench", variant="medium", 
metric="similarity") 106 | # Relatedness on IdBench-Large 107 | b2 = Benchmark.build("idbench", variant="large", metric="relatedness") 108 | ``` 109 | 110 | #### Compute VarCLR scores and evaluate 111 | 112 | ```python 113 | id1_list, id2_list = b1.get_inputs() 114 | predicted = model.score(id1_list, id2_list) 115 | print(b1.evaluate(predicted)) 116 | # {'spearmanr': 0.5248567181503295, 'pearsonr': 0.5249843473193132} 117 | 118 | print(b2.evaluate(model.score(*b2.get_inputs()))) 119 | # {'spearmanr': 0.8012168379981921, 'pearsonr': 0.8021791703187449} 120 | ``` 121 | 122 | #### Let's compare with the original [CodeBERT](https://github.com/microsoft/CodeBERT) 123 | 124 | ```python 125 | codebert = Encoder.from_pretrained("codebert") 126 | print(b1.evaluate(codebert.score(*b1.get_inputs()))) 127 | # {'spearmanr': 0.2056582946575104, 'pearsonr': 0.1995058696927054} 128 | print(b2.evaluate(codebert.score(*b2.get_inputs()))) 129 | # {'spearmanr': 0.3909218857993804, 'pearsonr': 0.3378219622284688} 130 | ``` 131 | 132 | ### Pre-train your own VarCLR models 133 | 134 | You can pretrain and get the same VarCLR model variants with the following code. 135 | 136 | ```bash 137 | python -m varclr.pretrain --model avg --name varclr-avg 138 | python -m varclr.pretrain --model lstm --name varclr-lstm 139 | python -m varclr.pretrain --model bert --name varclr-codebert --sp-model split --last-n-layer-output 4 --batch-size 64 --lr 1e-5 --epochs 1 140 | ``` 141 | 142 | The training progress and test results will be presented in the wandb dashboard. 
For reference, our training curves look like the following: 143 | 144 | ![training progress](docs/_static/images/training.png) 145 | 146 | ### Results on [IdBench](https://conf.researchr.org/details/icse-2021/icse-2021-papers/3/IdBench-Evaluating-Semantic-Representations-of-Identifier-Names-in-Source-Code) benchmarks 147 | 148 | #### Similarity 149 | 150 | | Method | Small | Medium | Large | 151 | | ---------------- | -------- | -------- | -------- | 152 | | FT-SG | 0.30 | 0.29 | 0.28 | 153 | | LV | 0.32 | 0.30 | 0.30 | 154 | | FT-cbow | 0.35 | 0.38 | 0.38 | 155 | | VarCLR-Avg | 0.47 | 0.45 | 0.44 | 156 | | VarCLR-LSTM | 0.50 | 0.49 | 0.49 | 157 | | VarCLR-CodeBERT | **0.53** | **0.53** | **0.51** | 158 | | | | | | 159 | | Combined-IdBench | 0.48 | 0.59 | 0.57 | 160 | | Combined-VarCLR | **0.66** | **0.65** | **0.62** | 161 | 162 | #### Relatedness 163 | 164 | | Method | Small | Medium | Large | 165 | | ---------------- | -------- | -------- | -------- | 166 | | LV | 0.48 | 0.47 | 0.48 | 167 | | FT-SG | 0.70 | 0.71 | 0.68 | 168 | | FT-cbow | 0.72 | 0.74 | 0.73 | 169 | | VarCLR-Avg | 0.67 | 0.66 | 0.66 | 170 | | VarCLR-LSTM | 0.71 | 0.70 | 0.69 | 171 | | VarCLR-CodeBERT | **0.79** | **0.79** | **0.80** | 172 | | | | | | 173 | | Combined-IdBench | 0.71 | 0.78 | 0.79 | 174 | | Combined-VarCLR | **0.79** | **0.81** | **0.85** | 175 | 176 | ### Cite 177 | 178 | If you find VarCLR useful in your research, please cite our paper@ICSE2022: 179 | 180 | ```bibtex 181 | @inproceedings{ChenVarCLR2022, 182 | author = {Chen, Qibin and Lacomis, Jeremy and Schwartz, Edward J. 
and Neubig, Graham and Vasilescu, Bogdan and {Le~Goues}, Claire}, 183 | title = {{VarCLR}: {Variable} Semantic Representation Pre-training via Contrastive Learning}, 184 | booktitle = {International Conference on Software Engineering}, 185 | year = {2022}, 186 | series = {ICSE '22} 187 | } 188 | ``` 189 | -------------------------------------------------------------------------------- /cs-cs.var.tok.txt.codebert.vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/cs-cs.var.tok.txt.codebert.vocab -------------------------------------------------------------------------------- /docs/_static/images/squareslab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/squareslab.png -------------------------------------------------------------------------------- /docs/_static/images/strudel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/strudel.png -------------------------------------------------------------------------------- /docs/_static/images/training.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/training.jpeg -------------------------------------------------------------------------------- /docs/_static/images/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/training.png 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="varclr", 5 | version="1.0", 6 | author="Qibin Chen", 7 | author_email="qibinc@andrew.cmu.edu", 8 | license="MIT", 9 | python_requires=">=3.6", 10 | packages=find_packages(exclude=[]), 11 | install_requires=[ 12 | "black>=21.10b0", 13 | "gdown>=4.2.0", 14 | "isort>=5.8.0", 15 | "pandas>=1.1.0", 16 | "pre-commit>=2.15.0", 17 | "pytest>=6.2.4", 18 | "pytorch-lightning>=1.0.8,<1.3", 19 | "sentencepiece>=0.1.95", 20 | "scipy>=1.5.2", 21 | "torch>=1.7.1", 22 | "transformers==4.5.1", 23 | "wandb>=0.12.6", 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /tests/reproduce/test_idbench.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from varclr.benchmarks import Benchmark 4 | from varclr.models.model import Encoder 5 | 6 | 7 | def test_codebert(): 8 | model = Encoder.from_pretrained("varclr-codebert") 9 | 10 | paper_results = { 11 | ("small", "similarity"): 0.53, 12 | ("medium", "similarity"): 0.53, 13 | ("large", "similarity"): 0.51, 14 | ("small", "relatedness"): 0.79, 15 | ("medium", "relatedness"): 0.79, 16 | ("large", "relatedness"): 0.80, 17 | } 18 | for (variant, metric), expected in paper_results.items(): 19 | b = Benchmark.build("idbench", variant=variant, metric=metric) 20 | actual = b.evaluate(model.score(*b.get_inputs()))["spearmanr"] 21 | assert np.allclose(actual, expected, atol=1e-2) 22 | 23 | 24 | def test_avg(): 25 | model = Encoder.from_pretrained("varclr-avg") 26 | 27 | paper_results = { 28 | ("small", "similarity"): 0.47, 29 | ("medium", "similarity"): 0.45, 30 | ("large", "similarity"): 0.44, 31 | ("small", "relatedness"): 0.67, 32 | ("medium", "relatedness"): 0.66, 33 | ("large", 
"relatedness"): 0.66, 34 | } 35 | for (variant, metric), expected in paper_results.items(): 36 | b = Benchmark.build("idbench", variant=variant, metric=metric) 37 | actual = b.evaluate(model.score(*b.get_inputs()))["spearmanr"] 38 | assert np.allclose(actual, expected, atol=1e-2) 39 | 40 | 41 | def test_lstm(): 42 | model = Encoder.from_pretrained("varclr-lstm") 43 | 44 | paper_results = { 45 | ("small", "similarity"): 0.50, 46 | ("medium", "similarity"): 0.49, 47 | ("large", "similarity"): 0.49, 48 | ("small", "relatedness"): 0.71, 49 | ("medium", "relatedness"): 0.70, 50 | ("large", "relatedness"): 0.69, 51 | } 52 | for (variant, metric), expected in paper_results.items(): 53 | b = Benchmark.build("idbench", variant=variant, metric=metric) 54 | actual = b.evaluate(model.score(*b.get_inputs()))["spearmanr"] 55 | assert np.allclose(actual, expected, atol=1e-2) 56 | -------------------------------------------------------------------------------- /varclr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/varclr/__init__.py -------------------------------------------------------------------------------- /varclr/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from varclr.benchmarks.benchmark import Benchmark 2 | 3 | __all__ = ["Benchmark"] 4 | -------------------------------------------------------------------------------- /varclr/benchmarks/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from typing import Dict, List, Tuple 4 | 5 | import pandas as pd 6 | from scipy.stats import pearsonr, spearmanr 7 | 8 | 9 | class Benchmark: 10 | @staticmethod 11 | def build(benchmark: str, **kwargs): 12 | return {"idbench": IdBench}[benchmark](**kwargs) 13 | 14 | def get_inputs(self): 15 | raise 
NotImplementedError 16 | 17 | def get_labels(self): 18 | raise NotImplementedError 19 | 20 | def evaluate(self, predictions) -> Dict[str, float]: 21 | raise NotImplementedError 22 | 23 | 24 | class IdBench(Benchmark): 25 | 26 | BASELINES = ["FT-cbow", "FT-SG", "w2v-SG", "w2v-cbow", "Path-based"] 27 | 28 | def __init__(self, variant: str, metric: str) -> None: 29 | super().__init__() 30 | assert variant in {"small", "medium", "large"} 31 | assert metric in {"similarity", "relatedness"} 32 | self.variant = variant 33 | self.metric = metric 34 | 35 | pairs = pd.read_csv( 36 | os.path.join( 37 | pathlib.Path(__file__).parent.resolve(), 38 | f"idbench/{self.variant}_pair_wise.csv", 39 | ), 40 | ) 41 | 42 | df = pairs[ 43 | pairs.apply( 44 | lambda r: r[self.metric] != "NAN" 45 | and all(r[b] != "NAN" for b in IdBench.BASELINES), 46 | axis=1, 47 | ) 48 | ] 49 | self.varlist1 = df["id1"].tolist() 50 | self.varlist2 = df["id2"].tolist() 51 | self.scores = df[self.metric].astype(float).tolist() 52 | 53 | def get_inputs(self) -> Tuple[List[str], List[str]]: 54 | return self.varlist1, self.varlist2 55 | 56 | def get_labels(self) -> List[float]: 57 | return self.scores 58 | 59 | def evaluate(self, predictions) -> Dict[str, float]: 60 | return { 61 | "spearmanr": spearmanr(predictions, self.scores).correlation, 62 | "pearsonr": pearsonr(predictions, self.scores)[0], 63 | } 64 | -------------------------------------------------------------------------------- /varclr/benchmarks/idbench/large_pair_wise.csv: -------------------------------------------------------------------------------- 1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW 2 | i,targ,0.3125,0.1718,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12 3 | expr,hint,0.1457,0.2155,NAN,0.11,0.42,0.21,0.25,0.36,0.0,0.5 4 | canvas,video,0.4167,0.7168,0.8173,0.48,0.57,0.49,0.24,0.78,0.0,0.42 5 | idx,indx,0.8542,0.9129,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75 6 | 
idx,ridx,0.4167,0.6948,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75 7 | right,bottom,0.078,0.7547,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42 8 | count,total,0.8088,0.8308,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6 9 | click,mousedown,0.7368,0.9448,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28 10 | change,keyup,0.3542,0.4987,NAN,0.55,0.65,0.54,0.55,0.81,0.0,0.42 11 | change,submit,0.327,0.4367,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5 12 | files,players,0.1168,0.1457,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5 13 | focus,resize,0.1332,0.3724,NAN,0.3,0.62,0.6,0.38,0.8,0.0,0.42 14 | reset,refresh,0.639,0.8112,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57 15 | pushStackLiteral,oldSelection,0.05,0.1807,0.231,0.13,0.27,0.32,0.31,0.55,0.12,0.44 16 | onAdd,onRemove,0.0192,0.9396,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44 17 | black,colours,0.077,0.8593,0.242,0.42,0.74,0.72,-0.12,0.53,0.14,0.43 18 | maroon,darkred,0.7115,0.819,NAN,0.45,0.73,0.71,0.15,0.84,0.29,0.57 19 | cosφ0,cosφ,0.625,0.891,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8 20 | allocate,contextmenu,0.1138,0.1441,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45 21 | response,alert,0.3087,0.6002,0.5999,0.33,0.53,0.46,0.14,0.31,0.0,0.31 22 | filename,fullname,0.1785,0.2903,NAN,0.65,0.65,0.7,0.11,0.72,0.75,0.88 23 | objects,records,0.6428,0.6825,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57 24 | ln,ilen,0.6965,0.7571,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5 25 | tasks,todos,0.9525,0.9378,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7 26 | images,authors,0.0418,0.2445,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5 27 | editable,dropdown,0.0588,0.2769,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5 28 | sources,adapters,0.375,0.5703,0.841,0.34,0.46,0.3,0.23,0.69,0.25,0.56 29 | ReactDOMComponent,ReactTextComponent,0.544,0.7231,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86 30 | λ0,φ0,0.2668,0.6339,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75 31 | xMin,xMax,0.0207,0.9783,0.3719,0.9,0.95,0.97,0.38,0.9,0.5,0.75 32 | FunctionExpression,FunctionDeclaration,0.2885,0.819,NAN,0.88,0.82,0.9,0.37,0.96,0.58,0.76 33 | 
Lines,CurRange,0.3077,0.3763,0.7382,0.58,0.68,0.87,0.46,0.39,0.12,0.38 34 | foundMap,foundStarMap,0.6923,0.819,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67 35 | columns,cols,0.9583,0.9419,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57 36 | dm,_queueHooks,0.15,0.198,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09 37 | fuchsia,pink,0.7345,0.8692,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29 38 | maxLine,maxLineLength,0.5833,0.8039,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54 39 | ExpressionStatement,FunctionDeclaration,0.3157,0.6423,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53 40 | addCls,removeCls,0.1457,0.8256,NAN,0.85,0.92,0.96,0.71,0.96,0.33,0.5 41 | object2,overlapOnly,0.0588,0.0154,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36 42 | nameSegment,foundStarMap,0.125,0.0411,NAN,0.37,0.6,0.89,0.8,0.76,0.08,0.5 43 | _selection,_sel,0.9808,0.9799,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4 44 | alignCenter,alignMiddle,0.625,0.8692,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73 45 | alignTop,popupLocationBar,0.0973,0.2592,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34 46 | targetFrame,targetFrameName,0.8035,0.8878,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73 47 | angle,radians,0.5312,0.8692,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43 48 | miny,ymin,0.9615,0.7989,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62 49 | element,dropdown,0.2917,0.477,0.7578,0.41,0.56,0.52,0.47,0.7,0.0,0.44 50 | equal,eql,0.9723,0.971,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6 51 | item,entry,0.7763,0.7798,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5 52 | events,rchecked,0.1427,0.1595,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44 53 | image,polyline,0.2223,0.3609,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38 54 | img,thumb,0.6528,0.7531,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4 55 | player,peer,0.45,0.5991,0.966,0.53,0.64,0.51,0.23,0.68,0.5,0.58 56 | files,profiles,0.2668,0.3549,0.5678,0.58,0.6,0.42,-0.01,0.77,0.62,0.62 57 | reset,clear,0.897,0.9077,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5 58 | username,userid,0.603,0.954,0.8685,0.72,0.79,0.67,0.15,0.65,0.5,0.62 59 | 
clear,refresh,0.4738,0.711,0.9717,0.55,0.66,0.61,0.35,0.81,0.14,0.43 60 | disabled,Tracker,0.05,0.1457,NAN,0.16,0.36,0.19,0.04,0.28,0.25,0.56 61 | olive,darkred,0.0938,0.6077,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43 62 | selectAnchor,anchorName,0.422,0.7385,0.9745,0.6,0.73,0.77,0.43,0.99,0.08,0.46 63 | names,filenames,0.6562,0.8039,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56 64 | setInterval,clearInterval,0.092,0.8624,0.3411,0.89,0.9,0.96,0.53,0.9,0.69,0.77 65 | getRules,foldingRules,0.375,0.6224,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54 66 | self_msgs,they_effects,0.0578,0.0748,0.2684,0.5,0.83,0.9,0.3,0.89,0.25,0.5 67 | getInstanceProp,setInstanceProp,0.0,0.8954,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97 68 | emptyText,blankText,0.8678,0.9077,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72 69 | minText,maxText,0.0735,0.9077,0.1279,0.95,0.9,0.93,0.65,0.99,0.71,0.86 70 | maxText,disabledDaysText,0.0167,0.0586,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38 71 | disabledDaysText,disabledDatesText,0.3905,0.8692,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91 72 | keywordMapper,buildinConstants,0.1332,0.1284,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41 73 | VM,invokePartial,0.0695,0.1284,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08 74 | blendMode,currentBlendMode,0.75,0.9038,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53 75 | touchmove,touchend,0.2778,0.8256,0.6156,0.91,0.94,0.96,0.27,0.97,0.56,0.72 76 | bindBuffer,ARRAY_BUFFER,0.3472,0.6658,0.625,0.51,0.81,0.96,0.72,0.65,0.08,0.46 77 | traverseContext,mapResult,0.3958,0.4553,NAN,0.36,0.58,0.84,0.25,0.68,0.2,0.4 78 | _owner,nextElement,0.0358,0.1595,0.1838,0.28,0.59,0.84,0.34,0.36,0.09,0.32 79 | m21,m22,0.1345,0.7385,NAN,0.98,0.96,0.95,0.48,0.94,0.67,0.83 80 | child,face,0.15,0.4247,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4 81 | displayMsg,emptyMsg,0.294,0.6925,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65 82 | pseudoElements,pseudoClasses,0.421,0.7246,0.851,0.71,0.75,0.78,0.74,0.99,0.57,0.75 83 | lastName,firstName,0.075,0.8954,0.3464,0.9,0.94,0.97,0.41,0.9,0.67,0.78 84 | 
Int16Array,Uint16Array,0.4625,0.9215,0.9195,0.91,0.91,0.94,0.18,0.83,0.82,0.86 85 | startSymbol,endSymbol,0.0375,0.9738,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68 86 | decrypt,ciphertext,0.3625,0.8039,0.8173,0.63,0.77,0.86,0.19,0.58,0.2,0.45 87 | rlocalProtocol,rurl,0.2738,0.5392,0.8272,0.66,0.6,0.67,0.47,0.91,0.21,0.25 88 | hSpace,popupFeatures,0.0375,0.1501,0.0,0.25,0.48,0.6,0.49,0.86,0.23,0.35 89 | linkTab,alignMiddle,0.1053,0.188,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45 90 | lockRatio,alignRight,0.0528,0.1467,0.231,0.47,0.45,0.57,0.13,0.74,0.2,0.55 91 | substr,substring,1.0,0.9419,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67 92 | columns,menus,0.327,0.5173,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5 93 | history,$ERROR,0.0155,0.1663,NAN,0.16,0.33,0.14,0.14,NAN,0.0,0.43 94 | deltaX,deltaY,0.2668,0.9129,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92 95 | MINUTE,SECOND,0.1405,0.9508,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5 96 | onDragStart,onDragEnd,0.1168,0.9825,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68 97 | body,agg,0.0938,0.1174,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38 98 | rows,pages,0.1833,0.5991,NAN,0.41,0.62,0.49,0.31,0.81,0.2,0.5 99 | store,storage,0.8333,0.9302,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71 100 | angle,theta,0.625,0.7571,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5 101 | foo,bar,0.5178,0.6077,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5 102 | DATE,MONTH,0.2308,0.9799,0.5943,0.65,0.75,0.83,0.34,0.89,0.2,0.5 103 | components,requests,0.0715,0.3277,NAN,0.34,0.44,0.26,0.22,0.77,0.3,0.55 104 | modal,calendar,0.1095,0.1663,NAN,0.27,0.44,0.33,0.16,0.6,0.25,0.44 105 | ids,tasks,0.3333,0.4595,NAN,0.38,0.58,0.31,0.26,0.8,0.2,0.4 106 | toolbar,preview,0.0333,0.0934,NAN,0.35,0.58,0.77,0.4,0.78,0.0,0.5 107 | $behaviour,foldingRules,0.1972,0.2568,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46 108 | a01,b01,0.4062,0.6893,NAN,0.85,0.94,0.9,0.44,0.94,0.67,0.83 109 | material,light,0.0657,0.2843,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31 110 | camera,texture,0.0832,0.2735,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5 111 | 
user,person,0.8947,0.8624,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42 112 | selected,active,0.7763,0.8486,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44 113 | rows,columns,0.0832,0.8839,0.2155,0.72,0.78,0.81,0.34,0.86,0.29,0.43 114 | files,problems,0.0555,0.0557,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5 115 | frames,markers,0.25,0.5497,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57 116 | objects,shortcuts,0.111,0.1428,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56 117 | cx,sx,0.3678,0.3847,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75 118 | tr,td,0.1,0.7385,NAN,0.81,0.82,0.8,0.56,0.77,0.5,0.75 119 | foo,trow,0.2678,0.1221,0.7743,0.22,0.62,-0.02,0.17,0.35,0.25,0.5 120 | bindBuffer,ELEMENT_ARRAY_BUFFER,0.2648,0.5078,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28 121 | navy,purple,0.2333,0.7037,0.5563,0.37,0.84,0.9,0.42,0.95,0.0,0.33 122 | host,author,0.5683,0.5722,NAN,0.38,0.54,0.21,0.13,0.57,0.17,0.33 123 | li,span,0.159,0.2869,NAN,0.53,0.73,0.59,0.37,0.82,0.0,0.25 124 | orange,pink,0.1345,0.9195,NAN,0.21,0.73,0.75,0.49,0.88,0.17,0.42 125 | CallExpression,BlockStatement,0.2142,0.4022,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5 126 | g,r,0.423,0.3363,0.8674,0.53,0.84,0.75,0.68,0.71,0.0,0.5 127 | data,azimuthal,0.206,0.154,0.401,0.28,0.45,0.14,0.02,0.44,0.22,0.33 128 | raw,movie,0.1527,0.3316,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3 129 | me,br,0.1322,0.1232,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5 130 | err,er,0.7115,0.7586,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67 131 | res,resp,0.6042,0.6077,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75 132 | content,newtext,0.3393,0.477,0.9284,0.35,0.57,0.34,0.22,0.51,0.43,0.71 133 | utils,util,0.8088,0.9846,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8 134 | renderer,screen,0.423,0.6177,0.8453,0.28,0.46,0.35,0.19,0.64,0.25,0.5 135 | maroon,olive,0.0535,0.5703,NAN,0.31,0.82,0.95,0.64,0.9,0.0,0.42 136 | olive,pink,0.044,0.5693,NAN,0.31,0.78,0.87,0.54,0.88,0.2,0.5 137 | λ0,λ1,0.2668,0.7385,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75 138 | 
arrayClass,boolClass,0.2353,0.677,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7 139 | paddingRight,paddingTop,0.0455,1.0,NAN,0.89,0.88,0.88,0.28,0.94,0.58,0.71 140 | expect,bp,0.05,0.0238,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25 141 | items,files,0.625,0.5892,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6 142 | disabled,visible,0.103,0.5384,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75 143 | round,sqrt,0.0578,0.5173,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4 144 | teal,lightgrey,0.25,0.7845,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28 145 | navy,lightblue,0.4117,0.8923,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22 146 | navy,lightgreen,0.125,0.6514,NAN,0.31,0.75,0.89,0.47,0.94,0.0,0.2 147 | navy,magenta,0.1765,0.8617,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36 148 | items,records,0.7345,0.8039,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43 149 | hide,blur,0.2708,0.5424,NAN,0.5,0.71,0.59,0.39,0.69,0.0,0.5 150 | VERSION,geoJson,0.0795,0.1085,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5 151 | Util,isParam,0.1,0.2155,NAN,0.27,0.52,0.29,0.35,0.4,0.0,0.29 152 | found,rawFunc,0.05,0.0934,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5 153 | a01,b11,0.175,0.4247,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67 154 | gray,silver,0.4667,0.9129,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33 155 | topLevelTarget,topLevelTargetID,0.6538,0.8392,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88 156 | defHeaders,defHeaderName,0.5312,0.9184,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73 157 | get,facets,0.1875,0.2155,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42 158 | equal,cut,0.0385,0.0947,0.2049,0.09,0.51,0.09,0.15,0.34,0.2,0.4 159 | start,begin,0.9423,0.9799,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5 160 | visible,showing,0.829,0.9038,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5 161 | dataMax,dataMin,0.125,0.9738,NAN,0.94,0.96,0.97,0.6,0.97,0.71,0.86 162 | len,ls,0.1965,0.3088,NAN,0.49,0.71,0.52,0.4,0.46,0.33,0.5 163 | items,ranges,0.25,0.3562,0.7983,0.24,0.55,0.38,0.18,0.78,0.17,0.5 164 | dispatchIDs,_dispatchIDs,0.828,0.9184,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92 165 | 
maroon,lightblue,0.0155,0.6731,0.1838,0.45,0.77,0.78,0.37,0.88,0.0,0.33 166 | remove,focus,0.0395,0.0502,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42 167 | b01,a03,0.1537,0.4569,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67 168 | getAnimation,depot,0.0735,0.0309,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29 169 | translateX,translateY,0.2293,0.9563,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95 170 | mountDepth,updateComponent,0.0625,0.063,0.2915,0.38,0.5,0.85,0.31,0.48,0.13,0.4 171 | left,bottom,0.125,0.6862,NAN,0.63,0.82,0.84,0.73,0.91,0.17,0.42 172 | nodes,filenames,0.2223,0.448,NAN,0.39,0.46,0.39,-0.08,0.63,0.33,0.44 173 | records,entries,0.8685,0.8624,1.0,0.41,0.53,0.52,0.14,0.76,0.29,0.57 174 | adjusted_scale,rangy,0.3077,0.4367,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18 175 | PLACEHOLDER,vertexFormat,0.1168,0.1807,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46 176 | vSpace,advisoryTitleInputLabel,0.0695,0.0557,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22 177 | styleSelectLabel,tag_h4,0.2812,0.4116,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22 178 | body,currants,0.1155,0.1953,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25 179 | alpha,rate,0.1138,0.1917,NAN,0.37,0.59,0.49,0.27,0.71,0.0,0.4 180 | indices,positions,0.6965,0.8133,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5 181 | keyup,deferId,0.1168,0.1109,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43 182 | getBorderWidth,getPadding,0.5295,0.8462,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54 183 | foo,abc,0.5833,0.2155,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5 184 | files,images,0.5,0.7385,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58 185 | layers,entries,0.2668,0.3724,0.7488,0.38,0.5,0.44,0.22,0.67,0.14,0.5 186 | generalTab,advancedTab,0.389,0.8839,0.0,0.63,0.76,0.91,0.77,0.98,0.27,0.59 187 | targetPopup,popupResizable,0.328,0.6569,0.6966,0.51,0.63,0.85,0.45,NAN,0.07,0.43 188 | start,searches,0.1168,0.1284,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5 189 | a24,a30,0.3462,0.7385,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67 190 | push,configure,0.047,0.1501,NAN,0.06,0.49,0.17,0.14,0.55,0.11,0.28 
191 | len,ln,0.794,0.8462,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67 192 | left,top,0.05,0.7646,NAN,0.8,0.88,0.91,0.81,0.98,0.0,0.38 193 | lightgreen,lightgrey,0.1138,0.6433,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85 194 | self,that,0.297,0.4932,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5 195 | y,z,0.4265,0.477,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5 196 | push,ts,0.111,0.1138,NAN,0.12,0.56,0.29,0.01,0.24,0.25,0.38 197 | element,elm,0.8678,0.9231,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43 198 | id,userid,0.7648,0.8308,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33 199 | id,sessionid,0.7205,0.8462,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22 200 | expect,assume,0.6765,0.7539,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5 201 | callback,cb,0.8595,0.8692,NAN,0.65,0.7,0.78,0.48,0.83,0.25,0.25 202 | container,video,0.2082,0.3172,NAN,0.29,0.5,0.21,0.18,0.7,0.22,0.39 203 | container,submenu,0.3595,0.4278,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44 204 | children,columns,0.2833,0.3201,0.7983,0.42,0.54,0.46,0.2,0.73,0.25,0.56 205 | items,tiles,0.3125,0.4116,NAN,0.28,0.55,0.35,0.27,0.79,0.4,0.6 206 | equal,ok,0.3195,0.3753,0.9052,0.7,0.8,0.67,0.65,0.76,0.0,0.2 207 | miny,ypos,0.375,0.6825,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5 208 | newLength,Group,0.0418,0.0267,0.1606,0.15,0.41,0.1,0.13,0.32,0.0,0.28 209 | colspan,assignTo,0.0625,0.0847,0.29,0.1,0.39,0.21,0.22,0.08,0.12,0.44 210 | setScrollTop,prefixWith,0.0795,0.1323,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42 211 | getMinutes,getUTCMinutes,0.725,0.9085,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77 212 | FONTDATA,FONTS,0.578,0.853,0.8931,0.92,0.92,0.95,0.54,0.75,0.5,0.56 213 | ReactEmptyComponent,renderToStaticMarkup,0.1073,0.091,0.3695,0.6,0.65,0.65,0.22,0.55,0.1,0.52 214 | images,streams,0.25,0.586,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5 215 | s1,s2,0.3552,0.7246,0.7833,0.97,0.96,0.99,0.56,0.89,0.5,0.75 216 | precondition,prereq,0.8125,0.8692,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38 217 | 
setAttributeNode,createAttribute,0.3678,0.7076,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62 218 | Connection,Client,0.2812,0.4608,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45 219 | dom__FixedSizeListIterator,_dom_pos,0.2655,0.5586,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21 220 | v2,v3,0.297,0.8039,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75 221 | trim,separatedList,0.163,0.2837,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23 222 | minVal,minValue,0.908,0.9587,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75 223 | btnModify,btnSetValue,0.6072,0.7385,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55 224 | nodeLinkFn,childLinkFn,0.75,0.8593,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73 225 | setMinutes,setSeconds,0.2205,0.9077,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7 226 | obj1,obj2,0.6095,0.8039,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88 227 | destroy,clear,0.6,0.8083,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43 228 | panelTitle,panelTitle1,0.8815,0.9587,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91 229 | __dependency1__,__dependency2__,0.4643,0.9378,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97 230 | ace,__ace_shadowed__,0.5,0.7945,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19 231 | m1,m2,0.5417,0.8112,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75 232 | latitude,longitude,0.0147,0.9385,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78 233 | item,record,0.6765,0.7694,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33 234 | cols,rects,0.173,0.4166,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5 235 | resetSize,popupFeatures,0.0418,0.1428,0.231,0.13,0.49,0.58,0.29,0.75,0.15,0.42 236 | parentWindow,scanner,0.0735,0.0463,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38 237 | popupToolbar,popupDependent,0.25,0.6077,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61 238 | cursor,intercept,0.15,0.2503,NAN,0.2,0.35,0.23,0.08,0.33,0.11,0.39 239 | split,shim,0.264,0.3316,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6 240 | ranges,codes,0.1818,0.3818,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58 241 | prereq,fnExists,0.1668,0.1109,NAN,0.34,0.76,0.91,0.67,0.01,0.0,0.38 242 | 
styleSelectLabel,cssClassInputLabel,0.5625,0.6077,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67 243 | ELEMENT_NODE,TEXT_NODE,0.5148,0.7999,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67 244 | rlocalProtocol,rprotocol,0.7083,0.8839,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61 245 | listeners,cbs,0.1922,0.0947,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22 246 | ranges,expressions,0.175,0.3724,NAN,0.38,0.46,0.45,0.17,0.78,0.18,0.36 247 | segments,annotations,0.2655,0.3135,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45 248 | onPlay,onPause,0.1618,0.8768,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64 249 | clientY,_keyStr,0.1168,0.1109,NAN,0.2,0.41,0.21,0.07,0.4,0.14,0.57 250 | oMatchesSelector,msMatchesSelector,0.6562,0.8692,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91 251 | re,destruct,0.0207,0.063,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19 252 | click,dblclick,0.206,0.9691,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62 253 | columns,requests,0.125,0.1663,0.5999,0.24,0.49,0.31,0.17,0.75,0.25,0.56 254 | a11,b12,0.1945,0.6804,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67 255 | pink,brown,0.0938,0.6893,0.2204,0.38,0.74,0.82,0.63,0.91,0.0,0.4 256 | floor,abs,0.1427,0.3462,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3 257 | math,miny,0.0177,0.3836,0.1581,0.47,0.63,0.44,0.27,0.35,0.25,0.62 258 | alignTop,popupToolbar,0.147,0.2001,NAN,0.39,0.47,0.52,0.57,NAN,0.17,0.42 259 | oDomRef,structuralType,0.0938,0.0194,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25 260 | dataAndEvents,deepDataAndEvents,0.7368,0.8624,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76 261 | toggle,isLength,0.0147,0.0,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44 262 | defaults,DomRange,0.1332,0.2155,NAN,0.11,0.34,0.16,-0.07,0.44,0.0,0.5 263 | advice,alternative,0.0695,0.1284,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41 264 | controller,mutator,0.3125,0.3789,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45 265 | files,locations,0.4465,0.4396,0.8585,0.25,0.46,0.3,-0.04,0.82,0.22,0.39 266 | click,mouseup,0.4305,0.8692,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36 267 | 
inputs,resources,0.4167,0.6339,0.7932,0.29,0.45,0.32,-0.03,0.78,0.22,0.44 268 | styleSelectLabel,inlineStyleInputLabel,0.5625,0.6569,NAN,0.72,0.74,0.91,0.24,NAN,0.48,0.62 269 | a12,a13,0.5,0.8133,NAN,0.96,0.97,0.96,0.5,0.98,0.67,0.83 270 | abs,sqrt,0.0177,0.6825,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38 271 | cells,segments,0.5938,0.6893,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44 272 | place,written,0.047,0.1009,0.3411,0.15,0.41,0.2,0.11,0.42,0.14,0.43 273 | on,immediate,0.175,0.1893,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11 274 | links,associations,0.6667,0.8039,NAN,0.33,0.42,0.42,-0.01,0.72,0.17,0.29 275 | names,sources,0.578,0.6077,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5 276 | minx,ymax,0.1168,0.6862,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62 277 | cols,domains,0.25,0.4294,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43 278 | ELEMENT_ARRAY_BUFFER,bufferData,0.5312,0.8366,0.875,0.51,0.75,0.9,0.52,0.55,0.0,0.25 279 | equals,same,0.8333,0.9346,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42 280 | nextTick,notification,0.1345,0.2356,NAN,0.32,0.43,0.39,0.02,0.33,0.25,0.46 281 | cfg,conf,0.9445,0.8546,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5 282 | wrapper,scroller,0.3845,0.477,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62 283 | userA,userB,0.4605,0.9174,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9 284 | columns,datasets,0.1875,0.4278,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5 285 | limit,exponent,0.1537,0.3562,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38 286 | onTouchStart,onTouchMove,0.4545,0.7861,0.9397,0.88,0.92,0.95,0.33,0.96,0.58,0.75 287 | sender,timeEnd,0.0625,0.1064,NAN,0.12,0.45,0.34,-0.11,0.43,0.14,0.43 288 | scaleX,scaleY,0.1668,0.9783,NAN,0.97,0.97,0.99,0.61,0.95,0.83,0.92 289 | ui,secret,0.0735,0.0463,0.3489,0.05,0.5,0.09,0.33,0.54,0.0,0.17 290 | builtinConstants,buildinConstants,0.6315,0.8073,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97 291 | objects,images,NAN,NAN,0.8986,0.33,0.58,0.44,-0.21,0.73,0.14,0.5 292 | items,images,NAN,NAN,0.8436,0.39,0.6,0.4,-0.09,0.83,0.33,0.58 293 | 
items,links,NAN,NAN,0.7983,0.39,0.57,0.52,-0.06,0.75,0.2,0.6 294 | canvas,image,NAN,NAN,0.9397,0.47,0.68,0.6,0.59,0.71,0.0,0.42 295 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56 296 | margin,padding,NAN,NAN,0.9369,0.62,0.74,0.78,0.62,0.87,0.43,0.64 297 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64 298 | -------------------------------------------------------------------------------- /varclr/benchmarks/idbench/medium_pair_wise.csv: -------------------------------------------------------------------------------- 1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW 2 | i,targ,0.341,0.1963,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12 3 | canvas,video,0.3638,0.6927,NAN,0.48,0.57,0.49,0.24,0.78,0.0,0.42 4 | idx,indx,0.9318,0.929,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75 5 | idx,ridx,0.4317,0.669,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75 6 | right,bottom,0.0832,0.7574,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42 7 | count,total,0.8125,0.8211,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6 8 | click,mousedown,0.7812,0.935,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28 9 | change,keyup,0.3542,0.5016,NAN,0.55,0.65,0.54,0.55,0.81,0.0,0.42 10 | change,submit,0.3542,0.48,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5 11 | files,players,0.125,0.1456,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5 12 | focus,resize,0.1457,0.3716,NAN,0.3,0.62,0.6,0.38,0.8,0.0,0.42 13 | reset,refresh,0.6178,0.8011,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57 14 | pushStackLiteral,oldSelection,0.0385,0.1599,0.231,0.13,0.27,0.32,0.31,0.55,0.12,0.44 15 | onAdd,onRemove,0.0207,0.9566,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44 16 | black,colours,0.0832,0.87,0.242,0.42,0.74,0.72,-0.12,0.53,0.14,0.43 17 | cosφ0,cosφ,0.591,0.8817,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8 18 | allocate,contextmenu,0.1,0.142,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45 19 | response,alert,0.2082,0.5884,NAN,0.33,0.53,0.46,0.14,0.31,0.0,0.31 20 | 
filename,fullname,0.15,0.246,NAN,0.65,0.65,0.7,0.11,0.72,0.75,0.88 21 | ln,ilen,0.6923,0.76,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5 22 | tasks,todos,0.9823,0.9444,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7 23 | images,authors,0.0167,0.3066,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5 24 | editable,dropdown,0.0578,0.3201,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5 25 | sources,adapters,0.4,0.532,NAN,0.34,0.46,0.3,0.23,0.69,0.25,0.56 26 | ReactDOMComponent,ReactTextComponent,0.5192,0.72,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86 27 | λ0,φ0,0.175,0.61,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75 28 | xMin,xMax,0.0227,0.9763,NAN,0.9,0.95,0.97,0.38,0.9,0.5,0.75 29 | FunctionExpression,FunctionDeclaration,0.3125,0.8484,NAN,0.88,0.82,0.9,0.37,0.96,0.58,0.76 30 | Lines,CurRange,0.325,0.428,NAN,0.58,0.68,0.87,0.46,0.39,0.12,0.38 31 | foundMap,foundStarMap,0.7083,0.8266,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67 32 | columns,cols,0.9667,0.9654,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57 33 | dm,_queueHooks,0.1138,0.149,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09 34 | fuchsia,pink,0.75,0.9134,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29 35 | maxLine,maxLineLength,0.5833,0.805,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54 36 | ExpressionStatement,FunctionDeclaration,0.3055,0.6534,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53 37 | addCls,removeCls,0.159,0.8346,NAN,0.85,0.92,0.96,0.71,0.96,0.33,0.5 38 | object2,overlapOnly,0.0667,0.0294,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36 39 | _selection,_sel,0.9772,0.9763,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4 40 | alignCenter,alignMiddle,0.659,0.8346,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73 41 | alignTop,popupLocationBar,0.125,0.2943,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34 42 | targetFrame,targetFrameName,0.8638,0.9054,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73 43 | angle,radians,0.6,0.922,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43 44 | miny,ymin,1.0,0.811,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62 45 | equal,eql,0.9823,0.9815,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6 46 | 
item,entry,0.8333,0.792,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5 47 | events,rchecked,0.1362,0.149,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44 48 | image,polyline,0.2205,0.3729,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38 49 | img,thumb,0.7,0.8094,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4 50 | player,peer,0.4545,0.669,NAN,0.53,0.64,0.51,0.23,0.68,0.5,0.58 51 | files,profiles,0.159,0.2436,NAN,0.58,0.6,0.42,-0.01,0.77,0.62,0.62 52 | reset,clear,0.8845,0.8999,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5 53 | username,userid,0.6,0.9654,0.8685,0.72,0.79,0.67,0.15,0.65,0.5,0.62 54 | clear,refresh,0.45,0.7054,NAN,0.55,0.66,0.61,0.35,0.81,0.14,0.43 55 | disabled,Tracker,0.025,0.194,NAN,0.16,0.36,0.19,0.04,0.28,0.25,0.56 56 | olive,darkred,0.0715,0.61,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43 57 | selectAnchor,anchorName,0.3125,0.675,NAN,0.6,0.73,0.77,0.43,0.99,0.08,0.46 58 | names,filenames,0.625,0.7956,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56 59 | setInterval,clearInterval,0.0715,0.9072,0.3411,0.89,0.9,0.96,0.53,0.9,0.69,0.77 60 | getRules,foldingRules,0.3845,0.6799,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54 61 | self_msgs,they_effects,0.0682,0.0546,0.2684,0.5,0.83,0.9,0.3,0.89,0.25,0.5 62 | getInstanceProp,setInstanceProp,0.0,0.9199,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97 63 | emptyText,blankText,0.8927,0.8885,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72 64 | minText,maxText,0.0,0.9628,0.1279,0.95,0.9,0.93,0.65,0.99,0.71,0.86 65 | maxText,disabledDaysText,0.0,0.0684,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38 66 | disabledDaysText,disabledDatesText,0.3638,0.8817,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91 67 | keywordMapper,buildinConstants,0.0625,0.09,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41 68 | VM,invokePartial,0.0333,0.1334,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08 69 | blendMode,currentBlendMode,0.75,0.9784,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53 70 | touchmove,touchend,0.2167,0.844,NAN,0.91,0.94,0.96,0.27,0.97,0.56,0.72 71 | bindBuffer,ARRAY_BUFFER,0.2812,0.6425,0.625,0.51,0.81,0.96,0.72,0.65,0.08,0.46 
72 | traverseContext,mapResult,0.4,0.48,NAN,0.36,0.58,0.84,0.25,0.68,0.2,0.4 73 | _owner,nextElement,0.0227,0.0546,0.1838,0.28,0.59,0.84,0.34,0.36,0.09,0.32 74 | m21,m22,0.159,0.7637,NAN,0.98,0.96,0.95,0.48,0.94,0.67,0.83 75 | child,face,0.1073,0.3872,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4 76 | displayMsg,emptyMsg,0.1875,0.6966,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65 77 | pseudoElements,pseudoClasses,0.4545,0.811,NAN,0.71,0.75,0.78,0.74,0.99,0.57,0.75 78 | lastName,firstName,0.0555,0.8989,0.3464,0.9,0.94,0.97,0.41,0.9,0.67,0.78 79 | Int16Array,Uint16Array,0.4705,0.9542,NAN,0.91,0.91,0.94,0.18,0.83,0.82,0.86 80 | startSymbol,endSymbol,0.044,0.9847,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68 81 | decrypt,ciphertext,0.3685,0.7949,NAN,0.63,0.77,0.86,0.19,0.58,0.2,0.45 82 | rlocalProtocol,rurl,0.3027,0.5621,NAN,0.66,0.6,0.67,0.47,0.91,0.21,0.25 83 | hSpace,popupFeatures,0.0155,0.1225,0.0,0.25,0.48,0.6,0.49,0.86,0.23,0.35 84 | linkTab,alignMiddle,0.0892,0.1643,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45 85 | lockRatio,alignRight,0.0192,0.1599,0.231,0.47,0.45,0.57,0.13,0.74,0.2,0.55 86 | substr,substring,1.0,0.9306,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67 87 | columns,menus,0.3862,0.551,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5 88 | history,$ERROR,0.0207,0.1984,NAN,0.16,0.33,0.14,0.14,NAN,0.0,0.43 89 | deltaX,deltaY,0.2857,0.9072,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92 90 | MINUTE,SECOND,0.125,0.9566,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5 91 | onDragStart,onDragEnd,0.1345,0.98,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68 92 | body,agg,0.0832,0.1115,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38 93 | rows,pages,0.1668,0.5234,NAN,0.41,0.62,0.49,0.31,0.81,0.2,0.5 94 | store,storage,0.8333,0.9306,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71 95 | angle,theta,0.75,0.811,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5 96 | foo,bar,0.5208,0.6316,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5 97 | DATE,MONTH,0.225,1.0,0.5943,0.65,0.75,0.83,0.34,0.89,0.2,0.5 98 | modal,calendar,0.0207,0.025,NAN,0.27,0.44,0.33,0.16,0.6,0.25,0.44 99 | 
ids,tasks,0.325,0.48,NAN,0.38,0.58,0.31,0.26,0.8,0.2,0.4 100 | orange,pink,0.159,0.929,NAN,0.21,0.73,0.75,0.49,0.88,0.17,0.42 101 | CallExpression,BlockStatement,0.2045,0.409,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5 102 | data,azimuthal,0.173,0.1599,0.401,0.28,0.45,0.14,0.02,0.44,0.22,0.33 103 | raw,movie,0.1,0.3066,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3 104 | expect,bp,0.05,0.0294,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25 105 | utils,util,0.75,0.9763,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8 106 | items,files,0.6042,0.5884,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6 107 | disabled,visible,0.125,0.6284,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75 108 | round,sqrt,0.0418,0.545,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4 109 | teal,lightgrey,0.25,0.87,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28 110 | navy,lightblue,0.4167,0.9306,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22 111 | navy,lightgreen,0.125,0.74,NAN,0.31,0.75,0.89,0.47,0.94,0.0,0.2 112 | navy,magenta,0.2115,0.8401,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36 113 | equal,cut,0.0455,0.1253,0.2049,0.09,0.51,0.09,0.15,0.34,0.2,0.4 114 | start,begin,0.9375,0.9784,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5 115 | renderer,screen,0.475,0.662,NAN,0.28,0.46,0.35,0.19,0.64,0.25,0.5 116 | visible,showing,0.8382,0.9082,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5 117 | foo,trow,0.2082,0.0684,NAN,0.22,0.62,-0.02,0.17,0.35,0.25,0.5 118 | arrayClass,boolClass,0.1965,0.6472,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7 119 | me,br,0.159,0.1017,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5 120 | items,records,0.725,0.844,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43 121 | items,ranges,0.2045,0.3383,NAN,0.24,0.55,0.38,0.18,0.78,0.17,0.5 122 | bindBuffer,ELEMENT_ARRAY_BUFFER,0.25,0.48,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28 123 | dispatchIDs,_dispatchIDs,0.8215,0.9628,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92 124 | maroon,lightblue,0.0227,0.7163,0.1838,0.45,0.77,0.78,0.37,0.88,0.0,0.33 125 | remove,focus,0.0578,0.0601,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42 126 | 
getAnimation,depot,0.0385,0.04,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29 127 | nodes,filenames,0.2333,0.4974,NAN,0.39,0.46,0.39,-0.08,0.63,0.33,0.44 128 | cx,sx,0.3832,0.4106,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75 129 | records,entries,0.9062,0.8861,1.0,0.41,0.53,0.52,0.14,0.76,0.29,0.57 130 | adjusted_scale,rangy,0.3333,0.48,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18 131 | PLACEHOLDER,vertexFormat,0.075,0.09,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46 132 | vSpace,advisoryTitleInputLabel,0.0832,0.0814,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22 133 | λ0,λ1,0.2,0.74,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75 134 | start,searches,0.1155,0.1399,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5 135 | navy,purple,0.225,0.688,NAN,0.37,0.84,0.9,0.42,0.95,0.0,0.33 136 | olive,pink,0.0625,0.5234,NAN,0.31,0.78,0.87,0.54,0.88,0.2,0.5 137 | push,configure,0.0625,0.1334,NAN,0.06,0.49,0.17,0.14,0.55,0.11,0.28 138 | len,ln,0.8125,0.8539,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67 139 | left,top,0.05,0.766,NAN,0.8,0.88,0.91,0.81,0.98,0.0,0.38 140 | self,that,0.3035,0.4615,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5 141 | y,z,0.4423,0.5801,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5 142 | element,elm,0.8655,0.9399,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43 143 | id,userid,0.7885,0.8401,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33 144 | id,sessionid,0.7917,0.935,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22 145 | equal,ok,0.3333,0.3934,NAN,0.7,0.8,0.67,0.65,0.76,0.0,0.2 146 | container,video,0.2293,0.415,NAN,0.29,0.5,0.21,0.18,0.7,0.22,0.39 147 | miny,ypos,0.2955,0.6927,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5 148 | newLength,Group,0.0295,0.0211,0.1606,0.15,0.41,0.1,0.13,0.32,0.0,0.28 149 | colspan,assignTo,0.0555,0.0754,0.29,0.1,0.39,0.21,0.22,0.08,0.12,0.44 150 | setScrollTop,prefixWith,0.0832,0.1456,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42 151 | getMinutes,getUTCMinutes,0.75,0.9277,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77 152 | FONTDATA,FONTS,0.5667,0.8614,0.8931,0.92,0.92,0.95,0.54,0.75,0.5,0.56 153 | 
ReactEmptyComponent,renderToStaticMarkup,0.0973,0.09,NAN,0.6,0.65,0.65,0.22,0.55,0.1,0.52 154 | images,streams,0.25,0.636,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5 155 | VERSION,geoJson,0.075,0.09,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5 156 | s1,s2,0.422,0.74,0.7833,0.97,0.96,0.99,0.56,0.89,0.5,0.75 157 | precondition,prereq,0.85,0.9134,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38 158 | setAttributeNode,createAttribute,0.4,0.7746,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62 159 | Connection,Client,0.25,0.4399,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45 160 | dom__FixedSizeListIterator,_dom_pos,0.3077,0.5801,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21 161 | v2,v3,0.3077,0.8001,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75 162 | container,submenu,0.3333,0.4366,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44 163 | children,columns,0.341,0.409,NAN,0.42,0.54,0.46,0.2,0.73,0.25,0.56 164 | items,tiles,0.3542,0.4584,NAN,0.28,0.55,0.35,0.27,0.79,0.4,0.6 165 | trim,separatedList,0.1912,0.3271,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23 166 | minVal,minValue,0.95,0.948,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75 167 | push,ts,0.1043,0.1115,NAN,0.12,0.56,0.29,0.01,0.24,0.25,0.38 168 | expect,assume,0.7708,0.805,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5 169 | btnModify,btnSetValue,0.5962,0.72,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55 170 | nodeLinkFn,childLinkFn,0.75,0.8601,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73 171 | setMinutes,setSeconds,0.2167,0.896,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7 172 | obj1,obj2,0.65,0.792,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88 173 | destroy,clear,0.6138,0.7873,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43 174 | panelTitle,panelTitle1,0.9167,0.9654,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91 175 | __dependency1__,__dependency2__,0.4605,0.9316,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97 176 | ace,__ace_shadowed__,0.5,0.818,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19 177 | m1,m2,0.5735,0.8625,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75 178 | latitude,longitude,0.0167,0.9306,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78 179 | 
item,record,0.6667,0.7746,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33 180 | cols,rects,0.2045,0.4563,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5 181 | resetSize,popupFeatures,0.05,0.1854,0.231,0.13,0.49,0.58,0.29,0.75,0.15,0.42 182 | callback,cb,0.8655,0.8601,NAN,0.65,0.7,0.78,0.48,0.83,0.25,0.25 183 | parentWindow,scanner,0.0715,0.0343,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38 184 | targetPopup,popupResizable,0.2885,0.5801,NAN,0.51,0.63,0.85,0.45,NAN,0.07,0.43 185 | popupToolbar,popupDependent,0.1785,0.5915,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61 186 | cursor,intercept,0.05,0.22,NAN,0.2,0.35,0.23,0.08,0.33,0.11,0.39 187 | split,shim,0.2917,0.3934,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6 188 | ranges,codes,0.2,0.428,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58 189 | generalTab,advancedTab,0.375,0.87,0.0,0.63,0.76,0.91,0.77,0.98,0.27,0.59 190 | styleSelectLabel,cssClassInputLabel,0.6138,0.669,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67 191 | getBorderWidth,getPadding,0.4822,0.8328,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54 192 | ELEMENT_NODE,TEXT_NODE,0.5,0.805,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67 193 | rlocalProtocol,rprotocol,0.7,0.896,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61 194 | listeners,cbs,0.125,0.142,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22 195 | tr,td,0.0455,0.811,NAN,0.81,0.82,0.8,0.56,0.77,0.5,0.75 196 | user,person,0.8845,0.8401,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42 197 | onPlay,onPause,0.172,0.87,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64 198 | topLevelTarget,topLevelTargetID,0.625,0.87,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88 199 | clientY,_keyStr,0.0682,0.1017,NAN,0.2,0.41,0.21,0.07,0.4,0.14,0.57 200 | oMatchesSelector,msMatchesSelector,0.6833,0.8614,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91 201 | re,destruct,0.0207,0.0684,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19 202 | translateX,translateY,0.2045,0.9527,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95 203 | click,dblclick,0.2333,0.9654,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62 204 | 
columns,requests,0.1537,0.1599,NAN,0.24,0.49,0.31,0.17,0.75,0.25,0.56 205 | a11,b12,0.2142,0.6284,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67 206 | pink,brown,0.0358,0.7215,0.2204,0.38,0.74,0.82,0.63,0.91,0.0,0.4 207 | floor,abs,0.1138,0.4563,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3 208 | math,miny,0.0192,0.3599,0.1581,0.47,0.63,0.44,0.27,0.35,0.25,0.62 209 | toggle,isLength,0.0147,0.0057,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44 210 | paddingRight,paddingTop,0.05,1.0,NAN,0.89,0.88,0.88,0.28,0.94,0.58,0.71 211 | dataAndEvents,deepDataAndEvents,0.6875,0.8375,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76 212 | found,rawFunc,0.0385,0.0998,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5 213 | alpha,rate,0.125,0.22,NAN,0.37,0.59,0.49,0.27,0.71,0.0,0.4 214 | layers,entries,0.2955,0.4327,NAN,0.38,0.5,0.44,0.22,0.67,0.14,0.5 215 | keyup,deferId,0.1345,0.1399,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43 216 | material,light,0.0832,0.3414,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31 217 | controller,mutator,0.3125,0.3716,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45 218 | files,locations,0.4423,0.48,0.8585,0.25,0.46,0.3,-0.04,0.82,0.22,0.39 219 | files,problems,0.0625,0.0738,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5 220 | frames,markers,0.2188,0.5611,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57 221 | objects,shortcuts,0.1168,0.1505,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56 222 | a01,b11,0.1965,0.48,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67 223 | defHeaders,defHeaderName,0.5208,0.9566,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73 224 | click,mouseup,0.5168,0.9134,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36 225 | inputs,resources,0.4107,0.6284,0.7932,0.29,0.45,0.32,-0.03,0.78,0.22,0.44 226 | advice,alternative,0.0715,0.1271,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41 227 | body,currants,0.125,0.22,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25 228 | a12,a13,0.4167,0.7616,NAN,0.96,0.97,0.96,0.5,0.98,0.67,0.83 229 | abs,sqrt,0.0207,0.74,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38 230 | cells,segments,0.5893,0.7028,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44 231 | 
place,written,0.0227,0.1253,NAN,0.15,0.41,0.2,0.11,0.42,0.14,0.43 232 | names,sources,0.577,0.6001,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5 233 | Util,isParam,0.075,0.168,NAN,0.27,0.52,0.29,0.35,0.4,0.0,0.29 234 | minx,ymax,0.125,0.7028,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62 235 | segments,annotations,0.25,0.3128,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45 236 | cols,domains,0.25,0.402,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43 237 | ELEMENT_ARRAY_BUFFER,bufferData,0.4772,0.8583,0.875,0.51,0.75,0.9,0.52,0.55,0.0,0.25 238 | cfg,conf,0.9667,0.896,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5 239 | userA,userB,0.4822,0.9256,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9 240 | columns,datasets,0.1922,0.4199,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5 241 | limit,exponent,0.1818,0.3383,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38 242 | oDomRef,structuralType,0.077,0.0,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25 243 | camera,texture,0.0578,0.1999,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5 244 | selected,active,0.7322,0.87,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44 245 | rows,columns,0.0715,0.87,0.2155,0.72,0.78,0.81,0.34,0.86,0.29,0.43 246 | nextTick,notification,0.175,0.298,NAN,0.32,0.43,0.39,0.02,0.33,0.25,0.46 247 | ui,secret,0.0625,0.0684,NAN,0.05,0.5,0.09,0.33,0.54,0.0,0.17 248 | objects,records,NAN,NAN,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57 249 | objects,images,NAN,NAN,0.8986,0.33,0.58,0.44,-0.21,0.73,0.14,0.5 250 | items,images,NAN,NAN,0.8436,0.39,0.6,0.4,-0.09,0.83,0.33,0.58 251 | canvas,image,NAN,NAN,0.9397,0.47,0.68,0.6,0.59,0.71,0.0,0.42 252 | indices,positions,NAN,NAN,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5 253 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56 254 | margin,padding,NAN,NAN,0.9369,0.62,0.74,0.78,0.62,0.87,0.43,0.64 255 | $behaviour,foldingRules,NAN,NAN,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46 256 | foo,abc,NAN,NAN,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5 257 | g,r,NAN,NAN,0.8674,0.53,0.84,0.75,0.68,0.71,0.0,0.5 258 | err,er,NAN,NAN,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67 
259 | res,resp,NAN,NAN,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75 260 | gray,silver,NAN,NAN,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33 261 | get,facets,NAN,NAN,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42 262 | files,images,NAN,NAN,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58 263 | b01,a03,NAN,NAN,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67 264 | styleSelectLabel,tag_h4,NAN,NAN,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22 265 | a24,a30,NAN,NAN,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67 266 | lightgreen,lightgrey,NAN,NAN,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85 267 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64 268 | on,immediate,NAN,NAN,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11 269 | equals,same,NAN,NAN,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42 270 | wrapper,scroller,NAN,NAN,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62 271 | builtinConstants,buildinConstants,NAN,NAN,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97 272 | -------------------------------------------------------------------------------- /varclr/benchmarks/idbench/small_pair_wise.csv: -------------------------------------------------------------------------------- 1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW 2 | response,alert,0.25,0.6348,NAN,0.33,0.53,0.46,0.14,0.31,0.0,0.31 3 | ln,ilen,0.7728,0.7866,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5 4 | tasks,todos,0.9772,0.9288,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7 5 | images,authors,0.0207,0.3042,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5 6 | editable,dropdown,0.0625,0.3261,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5 7 | ReactDOMComponent,ReactTextComponent,0.4772,0.7154,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86 8 | foundMap,foundStarMap,0.7045,0.8341,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67 9 | columns,cols,0.9667,0.9653,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57 10 | dm,_queueHooks,0.1138,0.1461,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09 11 | pushStackLiteral,oldSelection,0.05,0.113,NAN,0.13,0.27,0.32,0.31,0.55,0.12,0.44 12 | 
fuchsia,pink,0.775,0.9478,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29 13 | ExpressionStatement,FunctionDeclaration,0.2,0.6173,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53 14 | object2,overlapOnly,0.077,0.0368,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36 15 | _selection,_sel,0.9772,0.9763,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4 16 | alignTop,popupLocationBar,0.1457,0.3042,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34 17 | miny,ymin,1,0.7913,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62 18 | equal,eql,1,1,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6 19 | item,entry,0.775,0.7391,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5 20 | image,polyline,0.225,0.4521,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38 21 | player,peer,0.4,0.6608,NAN,0.53,0.64,0.51,0.23,0.68,0.5,0.58 22 | files,profiles,0.175,0.2695,NAN,0.58,0.6,0.42,-0.01,0.77,0.62,0.62 23 | reset,clear,0.95,0.9478,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5 24 | reset,refresh,0.6332,0.7913,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57 25 | username,userid,0.577,0.9799,NAN,0.72,0.79,0.67,0.15,0.65,0.5,0.62 26 | clear,refresh,0.4375,0.6955,NAN,0.55,0.66,0.61,0.35,0.81,0.14,0.43 27 | olive,darkred,0.077,0.5787,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43 28 | selectAnchor,anchorName,0.35,0.7391,NAN,0.6,0.73,0.77,0.43,0.99,0.08,0.46 29 | names,filenames,0.625,0.7827,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56 30 | setInterval,clearInterval,0.0625,0.8912,NAN,0.89,0.9,0.96,0.53,0.9,0.69,0.77 31 | getRules,foldingRules,0.35,0.6608,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54 32 | self_msgs,they_effects,0.05,0.0347,NAN,0.5,0.83,0.9,0.3,0.89,0.25,0.5 33 | getInstanceProp,setInstanceProp,0,0.9478,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97 34 | emptyText,blankText,0.8958,0.9564,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72 35 | minText,maxText,0,0.9564,NAN,0.95,0.9,0.93,0.65,0.99,0.71,0.86 36 | maxText,disabledDaysText,0,0.0277,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38 37 | disabledDaysText,disabledDatesText,0.375,0.8956,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91 38 | 
keywordMapper,buildinConstants,0.05,0.0347,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41 39 | VM,invokePartial,0.0358,0.1427,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08 40 | blendMode,currentBlendMode,0.75,0.9739,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53 41 | touchmove,touchend,0.2323,0.8696,NAN,0.91,0.94,0.96,0.27,0.97,0.56,0.72 42 | bindBuffer,ARRAY_BUFFER,0.2833,0.6348,NAN,0.51,0.81,0.96,0.72,0.65,0.08,0.46 43 | click,mousedown,0.7857,0.9254,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28 44 | files,players,0.159,0.1698,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5 45 | _owner,nextElement,0.025,0.0347,NAN,0.28,0.59,0.84,0.34,0.36,0.09,0.32 46 | child,face,0.125,0.3694,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4 47 | displayMsg,emptyMsg,0.2045,0.6679,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65 48 | pseudoElements,pseudoClasses,0.475,0.7913,NAN,0.71,0.75,0.78,0.74,0.99,0.57,0.75 49 | lastName,firstName,0.0625,0.9022,NAN,0.9,0.94,0.97,0.41,0.9,0.67,0.78 50 | Int16Array,Uint16Array,0.453,0.951,NAN,0.91,0.91,0.94,0.18,0.83,0.82,0.86 51 | startSymbol,endSymbol,0.047,0.9838,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68 52 | decrypt,ciphertext,0.3595,0.8369,NAN,0.63,0.77,0.86,0.19,0.58,0.2,0.45 53 | rlocalProtocol,rurl,0.3235,0.5857,NAN,0.66,0.6,0.67,0.47,0.91,0.21,0.25 54 | hSpace,popupFeatures,0.0177,0.1427,0,0.25,0.48,0.6,0.49,0.86,0.23,0.35 55 | linkTab,alignMiddle,0.0625,0.0869,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45 56 | lockRatio,alignRight,0,0.1391,NAN,0.47,0.45,0.57,0.13,0.74,0.2,0.55 57 | substr,substring,1,0.9303,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67 58 | change,submit,0.4,0.426,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5 59 | columns,menus,0.4,0.5565,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5 60 | deltaX,deltaY,0.2,0.8956,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92 61 | MINUTE,SECOND,0.125,0.9564,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5 62 | onDragStart,onDragEnd,0.1457,0.9783,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68 63 | expect,bp,0.0578,0.0368,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25 64 | 
items,files,0.575,0.5826,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6 65 | disabled,visible,0.175,0.713,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75 66 | round,sqrt,0.0227,0.5732,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4 67 | teal,lightgrey,0.25,0.8696,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28 68 | navy,lightblue,0.3638,0.9525,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22 69 | navy,magenta,0.25,0.905,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36 70 | equal,cut,0.0455,0.1224,NAN,0.09,0.51,0.09,0.15,0.34,0.2,0.4 71 | start,begin,0.925,0.9739,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5 72 | visible,showing,0.85,0.9303,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5 73 | arrayClass,boolClass,0.1922,0.6387,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7 74 | me,br,0.159,0.0986,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5 75 | dispatchIDs,_dispatchIDs,0.7955,0.9525,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92 76 | getAnimation,depot,0.0227,0.0277,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29 77 | cx,sx,0.4107,0.4409,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75 78 | records,entries,0.9062,0.8857,1,0.41,0.53,0.52,0.14,0.76,0.29,0.57 79 | adjusted_scale,rangy,0.3333,0.4782,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18 80 | vSpace,advisoryTitleInputLabel,0.0578,0.0569,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22 81 | start,searches,0.1043,0.1521,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5 82 | len,ln,0.85,0.8956,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67 83 | self,that,0.175,0.3999,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5 84 | y,z,0.475,0.6608,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5 85 | element,elm,0.8862,0.9525,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43 86 | id,userid,0.8182,0.8578,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33 87 | miny,ypos,0.2955,0.6916,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5 88 | newLength,Group,0.0295,0.0177,NAN,0.15,0.41,0.1,0.13,0.32,0.0,0.28 89 | colspan,assignTo,0.0588,0.0793,NAN,0.1,0.39,0.21,0.22,0.08,0.12,0.44 90 | setScrollTop,prefixWith,0.0973,0.1594,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42 91 | 
getMinutes,getUTCMinutes,0.797,0.9348,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77 92 | FONTDATA,FONTS,0.5833,0.8912,NAN,0.92,0.92,0.95,0.54,0.75,0.5,0.56 93 | ReactEmptyComponent,renderToStaticMarkup,0.0735,0.0485,NAN,0.6,0.65,0.65,0.22,0.55,0.1,0.52 94 | VERSION,geoJson,0.0938,0.1195,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5 95 | s1,s2,0.4615,0.7592,NAN,0.97,0.96,0.99,0.56,0.89,0.5,0.75 96 | precondition,prereq,0.8393,0.9254,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38 97 | setAttributeNode,createAttribute,0.3928,0.7764,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62 98 | Connection,Client,0.2082,0.413,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45 99 | dom__FixedSizeListIterator,_dom_pos,0.225,0.5043,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21 100 | v2,v3,0.2917,0.8043,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75 101 | onAdd,onRemove,0.025,0.9739,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44 102 | destroy,clear,0.675,0.8435,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43 103 | panelTitle,panelTitle1,0.923,0.9598,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91 104 | __dependency1__,__dependency2__,0.4375,0.9186,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97 105 | minVal,minValue,0.9583,0.9564,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75 106 | ace,__ace_shadowed__,0.5,0.8174,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19 107 | m1,m2,0.6607,0.8696,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75 108 | latitude,longitude,0.0207,0.9564,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78 109 | cols,rects,0.175,0.4521,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5 110 | resetSize,popupFeatures,0.0358,0.1615,NAN,0.13,0.49,0.58,0.29,0.75,0.15,0.42 111 | trim,separatedList,0.2045,0.336,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23 112 | parentWindow,scanner,0.0682,0.0039,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38 113 | targetPopup,popupResizable,0.2955,0.6204,NAN,0.51,0.63,0.85,0.45,NAN,0.07,0.43 114 | popupToolbar,popupDependent,0.173,0.6186,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61 115 | split,shim,0.2728,0.3595,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6 116 | 
generalTab,advancedTab,0.3333,0.8696,0,0.63,0.76,0.91,0.77,0.98,0.27,0.59 117 | ELEMENT_NODE,TEXT_NODE,0.4808,0.8195,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67 118 | rlocalProtocol,rprotocol,0.6785,0.8881,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61 119 | listeners,cbs,0.125,0.1391,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22 120 | user,person,0.9375,0.8912,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42 121 | onPlay,onPause,0.1608,0.8881,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64 122 | re,destruct,0.0207,0.0652,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19 123 | translateX,translateY,0.225,0.9478,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95 124 | click,dblclick,0.2333,0.9653,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62 125 | columns,requests,0.1362,0.1224,NAN,0.24,0.49,0.31,0.17,0.75,0.25,0.56 126 | oMatchesSelector,msMatchesSelector,0.6667,0.8912,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91 127 | a11,b12,0.2308,0.6788,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67 128 | pink,brown,0.0385,0.7391,NAN,0.38,0.74,0.82,0.63,0.91,0.0,0.4 129 | floor,abs,0.1138,0.4545,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3 130 | math,miny,0.0227,0.3123,NAN,0.47,0.63,0.44,0.27,0.35,0.25,0.62 131 | toggle,isLength,0.0177,0.0123,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44 132 | found,rawFunc,0.0385,0.0968,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5 133 | material,light,0.0832,0.3392,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31 134 | controller,mutator,0.325,0.3739,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45 135 | files,locations,0.5,0.5494,NAN,0.25,0.46,0.3,-0.04,0.82,0.22,0.39 136 | files,problems,0.0535,0.0684,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5 137 | frames,markers,0.2115,0.5787,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57 138 | objects,shortcuts,0.1138,0.1461,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56 139 | a01,b11,0.173,0.4581,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67 140 | defHeaders,defHeaderName,0.5,0.9763,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73 141 | right,bottom,0.1043,0.7827,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42 142 | 
count,total,0.8462,0.7994,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6 143 | click,mouseup,0.5168,0.9131,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36 144 | inputs,resources,0.3845,0.6186,NAN,0.29,0.45,0.32,-0.03,0.78,0.22,0.44 145 | advice,alternative,0.075,0.113,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41 146 | abs,sqrt,0.0227,0.7154,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38 147 | cells,segments,0.6155,0.7391,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44 148 | place,written,0.025,0.113,NAN,0.15,0.41,0.2,0.11,0.42,0.14,0.43 149 | names,sources,0.575,0.5826,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5 150 | minx,ymax,0.1043,0.6739,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62 151 | segments,annotations,0.25,0.3178,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45 152 | ELEMENT_ARRAY_BUFFER,bufferData,0.475,0.8696,NAN,0.51,0.75,0.9,0.52,0.55,0.0,0.25 153 | body,currants,0.15,0.2173,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25 154 | cfg,conf,1,0.905,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5 155 | userA,userB,0.5192,0.9397,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9 156 | keyup,deferId,0.075,0.0869,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43 157 | columns,datasets,0.2045,0.4545,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5 158 | getBorderWidth,getPadding,0.4317,0.8103,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54 159 | obj1,obj2,0.6362,0.7866,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88 160 | limit,exponent,0.1818,0.336,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38 161 | camera,texture,0.0625,0.1738,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5 162 | selected,active,0.8125,0.9131,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44 163 | rows,columns,0.025,0.9217,NAN,0.72,0.78,0.81,0.34,0.86,0.29,0.43 164 | body,agg,0.05,0.0869,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38 165 | ui,secret,0.0625,0.0652,NAN,0.05,0.5,0.09,0.33,0.54,0.0,0.17 166 | dataAndEvents,deepDataAndEvents,0.7143,0.851,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76 167 | oDomRef,structuralType,0.0832,0,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25 168 | i,targ,NAN,NAN,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12 169 | 
allocate,contextmenu,NAN,NAN,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45 170 | objects,records,NAN,NAN,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57 171 | λ0,φ0,NAN,NAN,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75 172 | idx,indx,NAN,NAN,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75 173 | idx,ridx,NAN,NAN,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75 174 | ranges,codes,NAN,NAN,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58 175 | topLevelTarget,topLevelTargetID,NAN,NAN,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88 176 | store,storage,NAN,NAN,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71 177 | indices,positions,NAN,NAN,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5 178 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56 179 | λ0,λ1,NAN,NAN,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75 180 | cosφ0,cosφ,NAN,NAN,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8 181 | maxLine,maxLineLength,NAN,NAN,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54 182 | alignCenter,alignMiddle,NAN,NAN,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73 183 | targetFrame,targetFrameName,NAN,NAN,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73 184 | angle,radians,NAN,NAN,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43 185 | events,rchecked,NAN,NAN,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44 186 | img,thumb,NAN,NAN,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4 187 | PLACEHOLDER,vertexFormat,NAN,NAN,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46 188 | angle,theta,NAN,NAN,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5 189 | foo,bar,NAN,NAN,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5 190 | $behaviour,foldingRules,NAN,NAN,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46 191 | foo,abc,NAN,NAN,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5 192 | cols,domains,NAN,NAN,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43 193 | CallExpression,BlockStatement,NAN,NAN,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5 194 | raw,movie,NAN,NAN,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3 195 | err,er,NAN,NAN,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67 196 | res,resp,NAN,NAN,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75 197 | 
utils,util,NAN,NAN,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8 198 | remove,focus,NAN,NAN,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42 199 | items,records,NAN,NAN,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43 200 | bindBuffer,ELEMENT_ARRAY_BUFFER,NAN,NAN,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28 201 | gray,silver,NAN,NAN,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33 202 | get,facets,NAN,NAN,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42 203 | files,images,NAN,NAN,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58 204 | nodeLinkFn,childLinkFn,NAN,NAN,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73 205 | b01,a03,NAN,NAN,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67 206 | styleSelectLabel,tag_h4,NAN,NAN,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22 207 | a24,a30,NAN,NAN,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67 208 | lightgreen,lightgrey,NAN,NAN,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85 209 | id,sessionid,NAN,NAN,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22 210 | expect,assume,NAN,NAN,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5 211 | container,submenu,NAN,NAN,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44 212 | images,streams,NAN,NAN,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5 213 | btnModify,btnSetValue,NAN,NAN,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55 214 | setMinutes,setSeconds,NAN,NAN,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7 215 | styleSelectLabel,cssClassInputLabel,NAN,NAN,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67 216 | item,record,NAN,NAN,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33 217 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64 218 | on,immediate,NAN,NAN,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11 219 | equals,same,NAN,NAN,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42 220 | wrapper,scroller,NAN,NAN,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62 221 | builtinConstants,buildinConstants,NAN,NAN,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97 222 | -------------------------------------------------------------------------------- /varclr/data/__init__.py: -------------------------------------------------------------------------------- 1 | from 
class Example(object):
    """One identifier/sentence plus its vocabulary-index encoding."""

    def __init__(self, sentence):
        # Normalize once up front; all downstream work uses the lowered form.
        self.sentence = sentence.strip().lower()
        self.embeddings = []

    def populate_embeddings(
        self, words, zero_unk, tokenization, ngrams, scramble_rate=0
    ):
        """Fill ``self.embeddings`` with vocab ids for this sentence.

        ``tokenization`` selects character n-grams or whitespace ("sp")
        tokens; ``scramble_rate`` is a training-time augmentation that
        shuffles token order with the given probability.  Unknown tokens
        are dropped when ``zero_unk`` is set.
        """
        tokens = []
        if tokenization == "ngrams":
            # Pad with spaces so boundary n-grams are captured.
            padded = " " + self.sentence.strip() + " "
            tokens = [
                padded[start : start + ngrams]
                for start in range(len(padded))
                if len(padded[start : start + ngrams]) == ngrams
            ]
        elif tokenization == "sp":
            tokens = self.sentence.split()
            if scramble_rate and random.random() <= scramble_rate:
                random.shuffle(tokens)
        else:
            raise NotImplementedError
        looked_up = (Vocab.lookup(words, token, zero_unk) for token in tokens)
        self.embeddings = [idx for idx in looked_up if idx is not None]
        if not self.embeddings:
            # Guarantee a non-empty encoding so batching never sees [].
            self.embeddings = [words[Vocab.unk_string]]
super().__init__() 57 | self.data_file = data_file 58 | self.tokenization = args.tokenization 59 | self.ngrams = args.ngrams 60 | self.scramble_rate = args.scramble_rate 61 | self.zero_unk = args.zero_unk 62 | self.training = training 63 | self.processor1, self.processor2 = Preprocessor.build(data_file, args) 64 | self.examples_pairs = self.read_examples() 65 | if not os.path.exists(args.vocab_path): 66 | print(f"Vocab not found. Creating from {data_file}") 67 | self.vocab = Vocab.build(self.examples_pairs, args) 68 | torch.save(self.vocab, args.vocab_path) 69 | else: 70 | self.vocab = torch.load(args.vocab_path) 71 | 72 | def __getitem__(self, i): 73 | self.examples_pairs[i][0].populate_embeddings( 74 | self.vocab, 75 | self.zero_unk, 76 | self.tokenization, 77 | self.ngrams, 78 | scramble_rate=self.scramble_rate if self.training else 0, 79 | ) 80 | self.examples_pairs[i][1].populate_embeddings( 81 | self.vocab, 82 | self.zero_unk, 83 | self.tokenization, 84 | self.ngrams, 85 | scramble_rate=self.scramble_rate if self.training else 0, 86 | ) 87 | return self.examples_pairs[i] 88 | 89 | def __len__(self): 90 | return len(self.examples_pairs) 91 | 92 | def read_examples(self): 93 | examples = [] 94 | finished = set([]) # check for duplicates 95 | spliter = "," if "csv" in self.data_file else "\t" 96 | with io.open(self.data_file, "r", encoding="utf-8") as f: 97 | for idx, i in enumerate(f): 98 | if "csv" in self.data_file and idx == 0: 99 | # skip the first line in IdBench csv 100 | continue 101 | if i in finished: 102 | continue 103 | else: 104 | finished.add(i) 105 | 106 | i = i.split(spliter) 107 | if len(i[0].strip()) == 0 or len(i[1].strip()) == 0: 108 | continue 109 | 110 | i[0] = self.processor1(i[0]) 111 | i[1] = self.processor2(i[1]) 112 | 113 | if self.training: 114 | e = (Example(i[0]), Example(i[1])) 115 | else: 116 | if np.isnan(float(i[2])): 117 | continue 118 | e = (Example(i[0]), Example(i[1]), float(i[2])) 119 | examples.append(e) 120 | return 
examples 121 | 122 | @staticmethod 123 | def collate_fn(example_pairs): 124 | def torchify(batch: List[Example]): 125 | idxs = pad_sequence( 126 | [torch.tensor(ex.embeddings, dtype=torch.long) for ex in batch], 127 | batch_first=True, 128 | ) 129 | lengths = torch.tensor([len(e.embeddings) for e in batch], dtype=torch.long) 130 | return idxs, lengths 131 | 132 | ret = torchify([pair[0] for pair in example_pairs]), torchify( 133 | [pair[1] for pair in example_pairs] 134 | ) 135 | if len(example_pairs[0]) == 3: 136 | return ret[0], ret[1], torch.tensor([e[2] for e in example_pairs]) 137 | else: 138 | return ret 139 | 140 | @staticmethod 141 | def collate_fn_transformers(example_pairs): 142 | def torchify(batch: List[Example]): 143 | tokenizer = PretrainedTokenizer.get_instance() 144 | ret = tokenizer( 145 | [ex.sentence for ex in batch], return_tensors="pt", padding=True 146 | ) 147 | return ret["input_ids"], ret["attention_mask"] 148 | 149 | ret = torchify([pair[0] for pair in example_pairs]), torchify( 150 | [pair[1] for pair in example_pairs] 151 | ) 152 | if len(example_pairs[0]) == 3: 153 | return ret[0], ret[1], torch.tensor([e[2] for e in example_pairs]) 154 | else: 155 | return ret 156 | 157 | 158 | class RenamesDataModule(pl.LightningDataModule): 159 | def __init__( 160 | self, train_data_file: str, valid_data_file: str, test_data_files: str, args 161 | ): 162 | super().__init__() 163 | self.train_data_file = train_data_file 164 | self.valid_data_file = valid_data_file 165 | self.test_data_files = test_data_files.split(",") 166 | self.train_percent = args.train_percent 167 | self.args = args 168 | 169 | def prepare_data(self): 170 | assert os.path.exists(self.train_data_file) 171 | assert all(os.path.exists(test) for test in self.test_data_files) 172 | 173 | def setup(self, stage=None): 174 | 175 | # Assign train/val datasets for use in dataloaders 176 | if stage == "fit" or stage is None: 177 | self.train = RenamesDataset(self.train_data_file, self.args, 
training=True) 178 | if self.valid_data_file is None: 179 | self.train, self.valid = random_split( 180 | self.train, [len(self.train) - 1000, 1000] 181 | ) 182 | self.valid.training = False 183 | self.valid.data_file = self.train_data_file 184 | else: 185 | self.valid = RenamesDataset( 186 | self.valid_data_file, self.args, training=False 187 | ) 188 | num_for_training = int(len(self.train) * self.train_percent) 189 | self.train = random_split( 190 | self.train, [num_for_training, len(self.train) - num_for_training] 191 | )[0] 192 | 193 | # Assign test dataset for use in dataloader(s) 194 | if stage == "test" or stage is None: 195 | self.tests = [ 196 | RenamesDataset(test_data_file, self.args, training=False) 197 | for test_data_file in self.test_data_files 198 | ] 199 | 200 | def train_dataloader(self): 201 | return DataLoader( 202 | self.train, 203 | batch_size=self.args.batch_size, 204 | num_workers=self.args.num_workers, 205 | collate_fn=RenamesDataset.collate_fn 206 | if self.args.model != "bert" 207 | else RenamesDataset.collate_fn_transformers, 208 | ) 209 | 210 | def val_dataloader(self): 211 | return DataLoader( 212 | self.valid, 213 | batch_size=self.args.batch_size, 214 | num_workers=self.args.num_workers, 215 | collate_fn=RenamesDataset.collate_fn 216 | if self.args.model != "bert" 217 | else RenamesDataset.collate_fn_transformers, 218 | ) 219 | 220 | def test_dataloader(self): 221 | return [ 222 | DataLoader( 223 | test, 224 | batch_size=self.args.batch_size, 225 | num_workers=self.args.num_workers, 226 | collate_fn=RenamesDataset.collate_fn 227 | if self.args.model != "bert" 228 | else RenamesDataset.collate_fn_transformers, 229 | ) 230 | for test in self.tests 231 | ] 232 | -------------------------------------------------------------------------------- /varclr/data/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple, Union 3 | 4 | from sacremoses import 
MosesTokenizer 5 | 6 | from varclr.models.tokenizers import Tokenizer 7 | 8 | 9 | class Preprocessor: 10 | @staticmethod 11 | def build(data_file, args) -> Tuple["Preprocessor", "Preprocessor"]: 12 | if "STS" in data_file: 13 | print(f"Using STS processor for {data_file}") 14 | return STSTextPreprocessor.from_args(args), STSTextPreprocessor.from_args( 15 | args 16 | ) 17 | elif "idbench" in data_file: 18 | print(f"Using code processor for {data_file}") 19 | return CodePreprocessor.from_args(args), CodePreprocessor.from_args(args) 20 | elif "20k" in data_file: 21 | return Preprocessor(), Preprocessor() 22 | elif "nli" in data_file or "cs-cs" in data_file: 23 | print(f"Using NLI processor for {data_file}") 24 | return NLITextPreprocessor.from_args(args), NLITextPreprocessor.from_args( 25 | args 26 | ) 27 | else: 28 | raise NotImplementedError 29 | 30 | def __call__(self, sentence): 31 | return sentence 32 | 33 | 34 | class NLITextPreprocessor(Preprocessor): 35 | def __init__(self, tokenization, sp_model) -> None: 36 | self.tokenization = tokenization 37 | if self.tokenization == "sp": 38 | self.tokenizer = Tokenizer.build(sp_model) 39 | 40 | @staticmethod 41 | def from_args(args) -> "NLITextPreprocessor": 42 | return NLITextPreprocessor(args.tokenization, args.sp_model) 43 | 44 | def __call__(self, sentence): 45 | sent = sentence.lower() 46 | if self.tokenization == "sp": 47 | sent = " ".join(self.tokenizer.encode(sent)) 48 | return sent 49 | 50 | 51 | class STSTextPreprocessor(Preprocessor): 52 | def __init__(self, lang, tokenization, sp_model) -> None: 53 | self.moses = MosesTokenizer(lang=lang) 54 | self.tokenization = tokenization 55 | if self.tokenization == "sp": 56 | self.tokenizer = Tokenizer.build(sp_model) 57 | 58 | @staticmethod 59 | def from_args(args) -> "STSTextPreprocessor": 60 | return STSTextPreprocessor(args.tokenization, args.sp_model) 61 | 62 | def __call__(self, sentence): 63 | sent = " ".join(self.moses.tokenize(sentence)) 64 | sent = 
class CodePreprocessor(Preprocessor):
    """Normalize identifiers by splitting camelCase/snake_case into
    space-separated lowercase words, optionally re-tokenized with a
    sentencepiece model when ``tokenization == "sp"``."""

    def __init__(self, tokenization=None, sp_model=None):
        self.tokenization = tokenization
        if self.tokenization == "sp":
            self.tokenizer = Tokenizer.build(sp_model)

    @staticmethod
    def from_args(args) -> "CodePreprocessor":
        return CodePreprocessor(args.tokenization, args.sp_model)

    def __call__(self, var: Union[str, List[str]]):
        """Process a single identifier, or each identifier in a list."""
        if isinstance(var, str):
            return self._process(var)
        if isinstance(var, list) and all(isinstance(v, str) for v in var):
            return [self._process(v) for v in var]
        raise NotImplementedError

    def _process(self, var):
        # Drop '@' markers, then insert an underscore at each lower->upper
        # camel-case boundary so camelCase normalizes like snake_case.
        cleaned = var.replace("@", "")
        snake = re.sub("([a-z]|^)([A-Z]{1})", r"\1_\2", cleaned)
        words = snake.lower().replace("_", " ").strip()
        if self.tokenization == "sp":
            words = " ".join(self.tokenizer.encode(words))
        return words
31 | 32 | counter = Counter() 33 | 34 | for i in examples: 35 | update_counter(counter, i[0].sentence) 36 | update_counter(counter, i[1].sentence) 37 | 38 | counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0:max_len] 39 | 40 | vocab = {} 41 | for i in counter: 42 | vocab[i[0]] = len(vocab) 43 | 44 | vocab[Vocab.unk_string] = len(vocab) 45 | return vocab 46 | 47 | @staticmethod 48 | def get_words(examples, max_len=200000): 49 | def update_counter(counter, sentence): 50 | counter.update(sentence.split()) 51 | 52 | counter = Counter() 53 | 54 | for i in examples: 55 | update_counter(counter, i[0].sentence) 56 | update_counter(counter, i[1].sentence) 57 | 58 | counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0:max_len] 59 | 60 | vocab = {} 61 | for i in counter: 62 | vocab[i[0]] = len(vocab) 63 | 64 | vocab[Vocab.unk_string] = len(vocab) 65 | return vocab 66 | 67 | @staticmethod 68 | def lookup(words, w, zero_unk): 69 | w = w.lower() 70 | if w in words: 71 | return words[w] 72 | else: 73 | if zero_unk: 74 | return None 75 | else: 76 | return words[Vocab.unk_string] 77 | -------------------------------------------------------------------------------- /varclr/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/varclr/models/__init__.py -------------------------------------------------------------------------------- /varclr/models/encoders.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Union 3 | 4 | import gdown 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.utils.rnn import pack_padded_sequence as pack 9 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 10 | from torch.nn.utils.rnn import pad_sequence 11 | from transformers import AutoModel, 
AutoTokenizer 12 | 13 | from varclr.data.preprocessor import CodePreprocessor 14 | from varclr.data.vocab import Vocab 15 | from varclr.models import urls_pretrained_model 16 | 17 | 18 | class Encoder(nn.Module): 19 | @staticmethod 20 | def build(args) -> "Encoder": 21 | return {"avg": Averaging, "lstm": LSTM, "bert": BERT}[args.model].from_args( 22 | args 23 | ) 24 | 25 | @staticmethod 26 | def from_pretrained(model_name: str, save_path: str = "saved/") -> "Encoder": 27 | return { 28 | "varclr-avg": Averaging, 29 | "varclr-lstm": LSTM, 30 | "varclr-codebert": BERT, 31 | "codebert": CodeBERT, 32 | }[model_name].load(save_path) 33 | 34 | @staticmethod 35 | def from_args(args) -> "Encoder": 36 | raise NotImplementedError 37 | 38 | @staticmethod 39 | def load(save_path: str) -> "Encoder": 40 | raise NotImplementedError 41 | 42 | def forward(self, idxs, lengths): 43 | raise NotImplementedError 44 | 45 | def encode(self, inputs: Union[str, List[str]]) -> torch.Tensor: 46 | raise NotImplementedError 47 | 48 | def score( 49 | self, inputx: Union[str, List[str]], inputy: Union[str, List[str]] 50 | ) -> List[float]: 51 | if type(inputx) != type(inputy): 52 | raise Exception("Input X and Y must be either string or list of strings.") 53 | if isinstance(inputx, list) and len(inputx) != len(inputy): 54 | raise Exception("Input X and Y must have the same length") 55 | embx = self.encode(inputx) 56 | emby = self.encode(inputy) 57 | return F.cosine_similarity(embx, emby).tolist() 58 | 59 | def cross_score( 60 | self, inputx: Union[str, List[str]], inputy: Union[str, List[str]] 61 | ) -> List[List[float]]: 62 | if isinstance(inputx, str): 63 | inputx = [inputx] 64 | if isinstance(inputy, str): 65 | inputy = [inputy] 66 | assert all(isinstance(inp, str) for inp in inputx) 67 | assert all(isinstance(inp, str) for inp in inputy) 68 | embx = self.encode(inputx) 69 | embx /= embx.norm(dim=1, keepdim=True) 70 | emby = self.encode(inputy) 71 | emby /= emby.norm(dim=1, keepdim=True) 72 | 
return (embx @ emby.t()).tolist() 73 | 74 | @staticmethod 75 | def decor_forward(model_forward): 76 | """Decorate an encoder's forward pass to deal with raw inputs.""" 77 | processor = CodePreprocessor( 78 | tokenization="sp", sp_model=urls_pretrained_model.PRETRAINED_TOKENIZER 79 | ) 80 | 81 | def torchify(batch): 82 | idxs = pad_sequence( 83 | [torch.tensor(ex, dtype=torch.long) for ex in batch], 84 | batch_first=True, 85 | ) 86 | lengths = torch.tensor([len(e) for e in batch], dtype=torch.long) 87 | return idxs, lengths 88 | 89 | def tokenize_and_forward(self, inputs: Union[str, List[str]]) -> torch.Tensor: 90 | if isinstance(inputs, str): 91 | inputs = [inputs] 92 | var_ids = processor(inputs) 93 | batch = torchify( 94 | [ 95 | [ 96 | Vocab.lookup(self.vocab, w, True) 97 | for w in var.split() 98 | if Vocab.lookup(self.vocab, w, True) is not None 99 | ] 100 | or [self.vocab[Vocab.unk_string]] 101 | for var in var_ids 102 | ] 103 | ) 104 | idxs, lengths = batch 105 | return model_forward(self, idxs, lengths)[0].detach() 106 | 107 | return tokenize_and_forward 108 | 109 | @staticmethod 110 | def decor_bert_forward(model_forward): 111 | """Decorate an encoder's forward pass to deal with raw inputs.""" 112 | processor = CodePreprocessor() 113 | tokenizer = AutoTokenizer.from_pretrained( 114 | urls_pretrained_model.PRETRAINED_TOKENIZER 115 | ) 116 | 117 | def tokenize_and_forward(self, inputs: Union[str, List[str]]) -> torch.Tensor: 118 | inputs = processor(inputs) 119 | return_dict = tokenizer(inputs, return_tensors="pt", padding=True) 120 | return model_forward( 121 | self, return_dict["input_ids"], return_dict["attention_mask"] 122 | )[0].detach() 123 | 124 | return tokenize_and_forward 125 | 126 | 127 | class Averaging(Encoder): 128 | def __init__(self, vocab_size, dim, dropout): 129 | super().__init__() 130 | self.embedding = nn.Embedding(vocab_size, dim) 131 | self.dropout = dropout 132 | 133 | @staticmethod 134 | def from_args(args): 135 | return 
Averaging(args.vocab_size, args.dim, args.dropout) 136 | 137 | @staticmethod 138 | def load(save_path: str) -> Encoder: 139 | gdown.cached_download( 140 | urls_pretrained_model.PRETRAINED_AVG_URL, 141 | os.path.join(save_path, "lstm.zip"), 142 | md5=urls_pretrained_model.PRETRAINED_AVG_MD5, 143 | postprocess=gdown.extractall, 144 | ) 145 | state_dict = torch.load( 146 | os.path.join( 147 | save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "model" 148 | ), 149 | map_location=torch.device("cpu"), 150 | ) 151 | vocab_size, dim = state_dict["encoder.embedding.weight"].shape 152 | m = nn.Module() 153 | m.encoder = Averaging(vocab_size, dim, 0) 154 | m.load_state_dict(state_dict) 155 | m = m.encoder 156 | # HACK: for inference 157 | vocab = torch.load( 158 | os.path.join( 159 | save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "vocab" 160 | ) 161 | ) 162 | m.vocab = vocab 163 | return m 164 | 165 | def forward(self, idxs, lengths): 166 | word_embs = self.embedding(idxs) 167 | word_embs = F.dropout(word_embs, p=self.dropout, training=self.training) 168 | 169 | bs, max_len, _ = word_embs.shape 170 | mask = ( 171 | torch.arange(max_len).to(word_embs.device).expand(bs, max_len) 172 | < lengths.unsqueeze(1) 173 | ).float() 174 | pooled = (word_embs * mask.unsqueeze(dim=2)).sum(dim=1) 175 | pooled = pooled / lengths.unsqueeze(dim=1) 176 | 177 | return pooled, (word_embs, mask) 178 | 179 | encode = Encoder.decor_forward(forward) 180 | 181 | 182 | class LSTM(Encoder): 183 | def __init__(self, hidden_dim, dropout, vocab_size, dim): 184 | super(LSTM, self).__init__() 185 | 186 | self.hidden_dim = hidden_dim 187 | self.dropout = dropout 188 | 189 | self.register_buffer("e_hidden_init", torch.zeros(2, 1, hidden_dim)) 190 | self.register_buffer("e_cell_init", torch.zeros(2, 1, hidden_dim)) 191 | 192 | self.embedding = nn.Embedding(vocab_size, dim) 193 | self.lstm = nn.LSTM( 194 | dim, 195 | hidden_dim, 196 | num_layers=1, 197 | bidirectional=True, 198 | batch_first=True, 
199 | ) 200 | 201 | @staticmethod 202 | def from_args(args): 203 | return LSTM(args.hidden_dim, args.dropout, args.vocab_size, args.dim) 204 | 205 | @staticmethod 206 | def load(save_path: str) -> Encoder: 207 | gdown.cached_download( 208 | urls_pretrained_model.PRETRAINED_LSTM_URL, 209 | os.path.join(save_path, "lstm.zip"), 210 | md5=urls_pretrained_model.PRETRAINED_LSTM_MD5, 211 | postprocess=gdown.extractall, 212 | ) 213 | state_dict = torch.load( 214 | os.path.join( 215 | save_path, urls_pretrained_model.PRETRAINED_LSTM_FOLDER, "model" 216 | ), 217 | map_location=torch.device("cpu"), 218 | ) 219 | hidden_dim = state_dict["encoder.e_hidden_init"].shape[2] 220 | vocab_size, dim = state_dict["encoder.embedding.weight"].shape 221 | m = nn.Module() 222 | m.encoder = LSTM(hidden_dim, 0, vocab_size, dim) 223 | m.load_state_dict(state_dict) 224 | m = m.encoder 225 | # HACK: for inference 226 | vocab = torch.load( 227 | os.path.join( 228 | save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "vocab" 229 | ) 230 | ) 231 | m.vocab = vocab 232 | return m 233 | 234 | def forward(self, inputs, lengths): 235 | bsz, max_len = inputs.size() 236 | e_hidden_init = self.e_hidden_init.expand(2, bsz, self.hidden_dim).contiguous() 237 | e_cell_init = self.e_cell_init.expand(2, bsz, self.hidden_dim).contiguous() 238 | lens, indices = torch.sort(lengths, 0, True) 239 | 240 | in_embs = self.embedding(inputs) 241 | in_embs = F.dropout(in_embs, p=self.dropout, training=self.training) 242 | 243 | all_hids, (enc_last_hid, _) = self.lstm( 244 | pack(in_embs[indices], lens.tolist(), batch_first=True), 245 | (e_hidden_init, e_cell_init), 246 | ) 247 | 248 | _, _indices = torch.sort(indices, 0) 249 | all_hids = unpack(all_hids, batch_first=True)[0][_indices] 250 | 251 | bs, max_len, _ = all_hids.shape 252 | mask = ( 253 | torch.arange(max_len).to(in_embs.device).expand(bs, max_len) 254 | < lengths.unsqueeze(1) 255 | ).float() 256 | pooled = (all_hids * mask.unsqueeze(dim=2)).sum(dim=1) 257 | 
pooled = pooled / lengths.unsqueeze(dim=1) 258 | 259 | return pooled, (all_hids, mask) 260 | 261 | encode = Encoder.decor_forward(forward) 262 | 263 | 264 | class BERT(Encoder): 265 | """VarCLR-CodeBERT Model.""" 266 | 267 | def __init__(self, bert_model: str, last_n_layer_output: int = 4): 268 | super().__init__() 269 | self.transformer = AutoModel.from_pretrained(bert_model) 270 | self.last_n_layer_output = last_n_layer_output 271 | 272 | @staticmethod 273 | def from_args(args): 274 | return BERT(args.bert_model, args.last_n_layer_output) 275 | 276 | @staticmethod 277 | def load(save_path: str) -> "BERT": 278 | gdown.cached_download( 279 | urls_pretrained_model.PRETRAINED_CODEBERT_URL, 280 | os.path.join(save_path, "bert.zip"), 281 | md5=urls_pretrained_model.PRETRAINED_CODEBERT_MD5, 282 | postprocess=gdown.extractall, 283 | ) 284 | return BERT( 285 | bert_model=os.path.join( 286 | save_path, urls_pretrained_model.PRETRAINED_CODEBERT_FOLDER 287 | ) 288 | ) 289 | 290 | def forward(self, input_ids, attention_mask): 291 | output = self.transformer( 292 | input_ids=input_ids, 293 | attention_mask=attention_mask, 294 | output_hidden_states=True, 295 | ) 296 | all_hids = output.hidden_states 297 | pooled = all_hids[-self.last_n_layer_output][:, 0] 298 | 299 | return pooled, (all_hids, attention_mask) 300 | 301 | encode = Encoder.decor_bert_forward(forward) 302 | 303 | 304 | class CodeBERT(BERT): 305 | """Original CodeBERT model https://github.com/microsoft/CodeBERT.""" 306 | 307 | @staticmethod 308 | def load(save_path: str) -> BERT: 309 | return BERT(bert_model="microsoft/codebert-base") 310 | -------------------------------------------------------------------------------- /varclr/models/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class NCESoftmaxLoss(nn.Module): 6 | """Softmax cross-entropy loss (a.k.a., info-NCE loss in CPC paper)""" 7 | 8 | def __init__(self, 
nce_t): 9 | super(NCESoftmaxLoss, self).__init__() 10 | self.loss = nn.CrossEntropyLoss(reduction="none") 11 | self.nce_t = nce_t 12 | 13 | def forward(self, x_ret, y_ret): 14 | x, _ = x_ret 15 | y, _ = y_ret 16 | bsz = x.shape[0] 17 | scores = ( 18 | (x / torch.norm(x, dim=1, keepdim=True)) 19 | @ (y / torch.norm(y, dim=1, keepdim=True)).t() 20 | / self.nce_t 21 | ) 22 | label = torch.arange(bsz, device=x.device) 23 | loss = self.loss(scores, label) + self.loss(scores.t(), label) 24 | return loss 25 | -------------------------------------------------------------------------------- /varclr/models/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytorch_lightning as pl 4 | import torch 5 | import torch.nn.functional as F 6 | from scipy.stats import pearsonr, spearmanr 7 | from torch import optim 8 | from varclr.models.encoders import Encoder 9 | from varclr.models.loss import NCESoftmaxLoss 10 | 11 | 12 | class Model(pl.LightningModule): 13 | def __init__(self, args): 14 | super(Model, self).__init__() 15 | self.args = args 16 | 17 | self.dropout = args.dropout 18 | self.loss = NCESoftmaxLoss(args.nce_t) 19 | args.vocab_size = len(torch.load(args.vocab_path)) 20 | args.parentmodel = self 21 | self.encoder = Encoder.build(args) 22 | 23 | def _forward(self, batch): 24 | (x_idxs, x_lengths), (y_idxs, y_lengths) = batch 25 | x_ret = self.encoder(x_idxs, x_lengths) 26 | y_ret = self.encoder(y_idxs, y_lengths) 27 | 28 | return self.loss(x_ret, y_ret) 29 | 30 | def _score(self, batch): 31 | (x_idxs, x_lengths), (y_idxs, y_lengths) = batch 32 | x_pooled, _ = self.encoder(x_idxs, x_lengths) 33 | y_pooled, _ = self.encoder(y_idxs, y_lengths) 34 | return F.cosine_similarity(x_pooled, y_pooled) 35 | 36 | def training_step(self, batch, batch_idx): 37 | loss = self._forward(batch).mean() 38 | self.log("loss/train", loss) 39 | return loss 40 | 41 | def _unlabeled_eval_step(self, batch, batch_idx): 42 | loss = 
self._forward(batch) 43 | return dict(loss=loss.detach().cpu()) 44 | 45 | def _labeled_eval_step(self, batch, batch_idx): 46 | *batch, labels = batch 47 | scores = self._score(batch) 48 | return dict(scores=scores.detach().cpu(), labels=labels.detach().cpu()) 49 | 50 | def _shared_eval_step(self, batch, batch_idx): 51 | if len(batch) == 3: 52 | return self._labeled_eval_step(batch, batch_idx) 53 | elif len(batch) == 2: 54 | return self._unlabeled_eval_step(batch, batch_idx) 55 | 56 | def _unlabeled_epoch_end(self, outputs, prefix): 57 | loss = torch.cat([o["loss"] for o in outputs]).mean() 58 | self.log(f"loss/{prefix}", loss) 59 | 60 | def _labeled_epoch_end(self, outputs, prefix): 61 | scores = torch.cat([o["scores"] for o in outputs]).tolist() 62 | labels = torch.cat([o["labels"] for o in outputs]).tolist() 63 | self.log(f"pearsonr/{prefix}", pearsonr(scores, labels)[0]) 64 | self.log(f"spearmanr/{prefix}", spearmanr(scores, labels).correlation) 65 | 66 | def _shared_epoch_end(self, outputs, prefix): 67 | if "labels" in outputs[0]: 68 | self._labeled_epoch_end(outputs, prefix) 69 | else: 70 | self._unlabeled_epoch_end(outputs, prefix) 71 | 72 | def validation_step(self, batch, batch_idx): 73 | return self._shared_eval_step(batch, batch_idx) 74 | 75 | def test_step(self, batch, batch_idx, dataloader_idx=0): 76 | return self._shared_eval_step(batch, batch_idx) 77 | 78 | def validation_epoch_end(self, outputs): 79 | self._shared_epoch_end( 80 | outputs, 81 | f"val_{os.path.basename(self.datamodule.val_dataloader().dataset.data_file)}", 82 | ) 83 | 84 | def test_epoch_end(self, outputs): 85 | if isinstance(outputs[0], list): 86 | for idx, subset_outputs in enumerate(outputs): 87 | self._shared_epoch_end( 88 | subset_outputs, 89 | f"test_{os.path.basename(self.datamodule.test_dataloader()[idx].dataset.data_file)}", 90 | ) 91 | else: 92 | self._shared_epoch_end( 93 | outputs, 94 | f"test_{os.path.basename(self.datamodule.test_dataloader().dataset.data_file)}", 95 | ) 
96 | 97 | def configure_optimizers(self): 98 | return {"bert": optim.AdamW}.get(self.args.model, optim.Adam)( 99 | self.parameters(), lr=self.args.lr 100 | ) 101 | -------------------------------------------------------------------------------- /varclr/models/tokenizers.py: -------------------------------------------------------------------------------- 1 | import sentencepiece as spm 2 | from transformers import AutoTokenizer 3 | 4 | 5 | class Tokenizer: 6 | @staticmethod 7 | def build(sp_model): 8 | if "sp.20k.model" in sp_model: 9 | return SPTokenizer(sp_model) 10 | elif "bert" in sp_model: 11 | return PretrainedTokenizer(sp_model) 12 | elif "split" in sp_model: 13 | return SplitTokenizer() 14 | else: 15 | raise NotImplementedError 16 | 17 | def encode(self, text): 18 | raise NotImplementedError 19 | 20 | 21 | class SplitTokenizer(Tokenizer): 22 | def encode(self, text): 23 | return text.strip().split() 24 | 25 | 26 | class SPTokenizer(Tokenizer): 27 | def __init__(self, model_path) -> None: 28 | self.sp = spm.SentencePieceProcessor() 29 | self.sp.Load(model_path) 30 | 31 | def encode(self, text): 32 | return self.sp.EncodeAsPieces(text) 33 | 34 | 35 | class PretrainedTokenizer(Tokenizer): 36 | 37 | _instance = None 38 | 39 | @staticmethod 40 | def get_instance(): 41 | return PretrainedTokenizer._instance 42 | 43 | @staticmethod 44 | def set_instance(tokenizer_name): 45 | PretrainedTokenizer._instance = AutoTokenizer.from_pretrained(tokenizer_name) 46 | 47 | def __init__(self, tokenizer_name) -> None: 48 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 49 | 50 | def encode(self, text): 51 | return list( 52 | map( 53 | str, 54 | self.tokenizer.encode(text, add_special_tokens=False, truncation=True), 55 | ) 56 | ) 57 | -------------------------------------------------------------------------------- /varclr/models/urls_pretrained_model.py: -------------------------------------------------------------------------------- 1 | PRETRAINED_TOKENIZER = 
"microsoft/codebert-base" 2 | 3 | PRETRAINED_CODEBERT_URL = ( 4 | "https://drive.google.com/uc?id=1xl8kdQtJ7ke4jyv5kHDiOc5dScPTTKzg" 5 | ) 6 | PRETRAINED_CODEBERT_FOLDER = "varclr_bert" 7 | PRETRAINED_CODEBERT_MD5 = "3844bd6e76a928084b0d742ac120a91c" 8 | 9 | PRETRAINED_AVG_URL = "https://drive.google.com/uc?id=1IFWvFQ2YKvCNRroy2RBqwSeQPGhHShX7" 10 | PRETRAINED_AVG_FOLDER = "varclr_avg" 11 | PRETRAINED_AVG_MD5 = "97ca667fac013b9a93fb87e91c0c3a0c" 12 | 13 | PRETRAINED_LSTM_URL = "https://drive.google.com/uc?id=1GZ9v0Zt4RazR1STBac8W116F-0bsRLzf" 14 | PRETRAINED_LSTM_FOLDER = "varclr_lstm" 15 | PRETRAINED_LSTM_MD5 = "a368f514ec16e45a58cbc94c67c67b80" 16 | -------------------------------------------------------------------------------- /varclr/pretrain.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import pytorch_lightning as pl 7 | import torch 8 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 9 | from pytorch_lightning.loggers import WandbLogger 10 | from transformers import AutoModel 11 | 12 | from varclr.data.dataset import RenamesDataModule 13 | from varclr.models.model import Model 14 | from varclr.models.tokenizers import PretrainedTokenizer 15 | from varclr.utils.options import add_options 16 | 17 | if __name__ == "__main__": 18 | 19 | parser = argparse.ArgumentParser() 20 | add_options(parser) 21 | args = parser.parse_args() 22 | random.seed(args.seed) 23 | np.random.seed(args.seed) 24 | torch.manual_seed(args.seed) 25 | 26 | dm = RenamesDataModule( 27 | args.train_data_file, args.valid_data_file, args.test_data_files, args 28 | ) 29 | if not os.path.exists(args.vocab_path): 30 | dm.setup() 31 | 32 | model = Model(args) 33 | if args.load_file is not None: 34 | model = model.load_from_checkpoint(args.load_file, args=args, strict=False) 35 | model.datamodule = dm 36 | 37 | if not args.test and "bert" in args.sp_model 
and args.model != "bert": 38 | # Load pre-trained word embeddings from bert 39 | bert = AutoModel.from_pretrained(args.sp_model) 40 | for word, idx in torch.load(args.vocab_path).items(): 41 | try: 42 | model.encoder.embedding.weight.data[ 43 | idx 44 | ] = bert.embeddings.word_embeddings.weight.data[int(word)] 45 | except ValueError: 46 | pass 47 | del bert 48 | if "bert" in args.model: 49 | PretrainedTokenizer.set_instance(args.bert_model) 50 | 51 | if args.valid_data_file is not None: 52 | callbacks = [ 53 | EarlyStopping( 54 | monitor=f"spearmanr/val_{os.path.basename(dm.valid_data_file)}", 55 | mode="max", 56 | patience=args.patience, 57 | ), 58 | ModelCheckpoint( 59 | monitor=f"spearmanr/val_{os.path.basename(dm.valid_data_file)}", 60 | mode="max", 61 | ), 62 | ] 63 | else: 64 | callbacks = [ 65 | EarlyStopping( 66 | monitor=f"loss/val_{os.path.basename(dm.train_data_file)}", 67 | patience=args.patience, 68 | ), 69 | ModelCheckpoint(monitor=f"loss/val_{os.path.basename(dm.train_data_file)}"), 70 | ] 71 | 72 | wandb_logger = WandbLogger(name=args.name, project="varclr", log_model=True) 73 | wandb_logger.log_hyperparams(args) 74 | args = argparse.Namespace(**wandb_logger.experiment.config) 75 | trainer = pl.Trainer( 76 | max_epochs=args.epochs, 77 | logger=wandb_logger, 78 | gpus=args.gpu, 79 | auto_select_gpus=args.gpu > 0, 80 | gradient_clip_val=args.grad_clip, 81 | callbacks=callbacks, 82 | progress_bar_refresh_rate=10, 83 | val_check_interval=0.25, 84 | limit_train_batches=args.limit_train_batches, 85 | ) 86 | 87 | if not args.test: 88 | trainer.fit(model, datamodule=dm) 89 | # will automatically load and test the best checkpoint instead of the last model 90 | trainer.test(datamodule=dm) 91 | else: 92 | # save in hf transformer ckpt format 93 | trainer.test(model, datamodule=dm) 94 | -------------------------------------------------------------------------------- /varclr/utils/find_nn.py: 
# ---- varclr/utils/find_nn.py ----
import torch

from infer import MockArgs
from varclr.data.preprocessor import CodePreprocessor

if __name__ == "__main__":
    # Load the variable embeddings produced by varclr/utils/infer.py.
    ret = torch.load("saved")
    vars, embs = ret["vars"], ret["embs"]
    var2idx = dict([(var, idx) for idx, var in enumerate(vars)])
    # while (line := input()) != "":
    processor = CodePreprocessor(MockArgs())
    for line in [
        "substr",
        "item",
        "count",
        "rows",
        "setInterval",
        "minText",
        "files",
        "miny",
    ]:
        # Re-join the canonicalized sub-tokens in camelCase, matching the
        # formatting used when the embedding file was written.
        line = "".join(
            [
                word.capitalize() if idx > 0 else word
                for idx, word in enumerate(processor(line.strip()).split())
            ]
        )
        if line not in var2idx:
            print("variable not found")
            continue
        # k=21 then drop rank 0: the query is its own nearest neighbor.
        result = torch.topk(embs @ embs[var2idx[line]], k=21)
        print([vars[idx] for idx in result.indices][1:])


# ---- varclr/utils/gen_typos.py ----
import nlpaug.augmenter.char as nac
import numpy as np

if __name__ == "__main__":
    # Sample 1024 variables and inject one keyboard-neighbor typo into each.
    with open("var.txt") as f:
        variables = [line.strip() for line in f.readlines()]
    np.random.seed(42)
    variables = np.random.choice(variables, 1024)
    aug = nac.KeyboardAug(
        aug_char_max=1,
        include_special_char=False,
        include_numeric=False,
        include_upper_case=False,
    )
    with open("typo_corr.txt", "w") as f, open("typo_var.txt", "w") as f_var:
        for variable in variables:
            aug_var = aug.augment(variable)
            variable, aug_var = variable.replace(" ", ""), aug_var.replace(" ", "")
            f.write(f"{aug_var} {variable}\n")
            f_var.write(f"{aug_var}\n")


# ---- varclr/utils/infer.py ----
import sys

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from varclr.data.preprocessor import CodePreprocessor


def forward(model, input_ids, attention_mask):
    """Return the [CLS] embedding of the 4th-from-last transformer layer."""
    output = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    )
    all_hids = output.hidden_states
    pooled = all_hids[-4][:, 0]
    return pooled


class MockArgs:
    """Minimal stand-in for the argparse namespace CodePreprocessor expects."""

    def __init__(self):
        self.tokenization = ""


def batcher(batch_size):
    """Yield deduplicated, preprocessed variables in batches of batch_size."""
    uniq = set()
    with open(sys.argv[1]) as f:
        vars = []
        for var in f:
            var = processor(var.strip())
            if var not in uniq:
                uniq.add(var)
                vars.append(var)
                if len(vars) == batch_size:
                    yield vars
                    vars = []
        # BUG FIX: when the variable count is an exact multiple of batch_size
        # the final flush used to yield an empty list, which crashes the
        # HuggingFace tokenizer downstream.  Only yield a non-empty remainder.
        if vars:
            yield vars


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    model = AutoModel.from_pretrained("bert_saved/")
    model.to(device)
    processor = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])
    for idx, vars in enumerate(tqdm(batcher(64))):
        ret = tokenizer(vars, return_tensors="pt", padding=True)
        embs = (
            forward(
                model, ret["input_ids"].to(device), ret["attention_mask"].to(device)
            )
            .detach()
            .cpu()
        )
        # Store camelCased display names alongside their embeddings.
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved")
# ---- varclr/utils/infer_avg.py ----
import argparse
import sys
import random
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from models import Model
from varclr.data.vocab import Vocab
from varclr.data.preprocessor import CodePreprocessor
from varclr.utils.options import add_options


def forward(model, input_ids, attention_mask):
    """Return the [CLS] embedding of the 4th-from-last transformer layer.

    NOTE(review): not referenced by the __main__ block below (it embeds via
    model.encoder); looks like a leftover from infer.py -- confirm before
    removing.
    """
    output = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    )
    all_hids = output.hidden_states
    pooled = all_hids[-4][:, 0]
    return pooled


class MockArgs:
    """Minimal stand-in for the argparse namespace CodePreprocessor expects."""

    def __init__(self):
        self.tokenization = ""


def batcher(batch_size):
    """Yield batches of (canonical-id tokens, display tokens) variable pairs,
    deduplicated by the canonical form."""
    uniq = set()
    with open("var.txt") as f:
        vars = []
        for var in f:
            var_id = processor(var.strip())
            var = processor2(var.strip())
            if var_id not in uniq:
                uniq.add(var_id)
                vars.append((var_id, var))
                if len(vars) == batch_size:
                    yield zip(*vars)
                    vars = []
        # BUG FIX: zip(*[]) on an empty remainder yields a zero-length
        # iterator, and the caller's "for var_ids, vars in ..." unpacking then
        # raises ValueError whenever the count is a multiple of batch_size.
        if vars:
            yield zip(*vars)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    add_options(parser)
    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model(args)
    model = model.load_from_checkpoint(args.load_file, args=args, strict=False)
    model = model.to(device)
    model.eval()
    vocab = torch.load(args.vocab_path)

    processor = CodePreprocessor(args)
    processor2 = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])

    def torchify(batch):
        # Pad variable-length id lists into a dense batch plus lengths.
        idxs = pad_sequence(
            [torch.tensor(ex, dtype=torch.long) for ex in batch],
            batch_first=True,
        )
        lengths = torch.tensor([len(e) for e in batch], dtype=torch.long)
        return idxs, lengths

    for var_ids, vars in tqdm(batcher(64)):
        batch = torchify(
            [
                [
                    Vocab.lookup(vocab, w, args.zero_unk)
                    for w in var.split()
                    if Vocab.lookup(vocab, w, args.zero_unk) is not None
                ]
                or [vocab[Vocab.unk_string]]
                for var in var_ids
            ]
        )
        x_idxs, x_lengths = batch
        ret = model.encoder(x_idxs.to(device), x_lengths.to(device))
        embs, _ = ret
        embs = embs.detach().cpu()
        # Store camelCased display names alongside their embeddings.
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved_lstm")


# ---- varclr/utils/infer_ft_cbow.py ----
import sys

import torch
from tqdm import tqdm

# NOTE(review): this first import is immediately shadowed by the next one and
# "utils" does not appear to be a module of this package -- it looks like dead
# code left from a refactor; kept as-is since only part of the tree is visible.
from utils import CodePreprocessor
from varclr.data.preprocessor import CodePreprocessor


def forward(model, input_ids, attention_mask):
    """Return the [CLS] embedding of the 4th-from-last transformer layer.

    NOTE(review): unused in this script's __main__ block.
    """
    output = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    )
    all_hids = output.hidden_states
    pooled = all_hids[-4][:, 0]
    return pooled


class MockArgs:
    """Minimal stand-in for the argparse namespace CodePreprocessor expects."""

    def __init__(self):
        self.tokenization = ""


def batcher(batch_size):
    """Yield batches of (canonical, original) variable pairs, deduplicated by
    the canonical form."""
    uniq = set()
    with open(sys.argv[1]) as f:
        vars = []
        for uncanon_var in f:
            uncanon_var = uncanon_var.strip()
            var = processor(uncanon_var)
            if var not in uniq:
                uniq.add(var)
                vars.append((var, uncanon_var))
                if len(vars) == batch_size:
                    yield list(zip(*vars))
                    vars = []
        # BUG FIX: an empty remainder used to yield [], and the caller's
        # "for vars, uncanon_vars in ..." unpacking would then raise
        # ValueError whenever the input count was a multiple of batch_size.
        if vars:
            yield list(zip(*vars))


def read_embs(fname):
    """Parse embedding lines keyed by a quoted "ID:<name>" first field into a
    name -> tensor dict."""
    all_embs = {}
    with open(fname) as f:
        for line in f:
            if not '"ID:' in line:
                continue
            name, *emb = line.strip().split()
            # Strip the surrounding quotes: "ID:foo" -> ID:foo
            name = name[1 : 1 + name[1:].index('"')]
            all_embs[name] = torch.tensor(list(map(float, emb)))
    return all_embs


if __name__ == "__main__":
    processor = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])
    all_embs = read_embs(sys.argv[2])
    for vars, uncanon_vars in tqdm(batcher(64)):
        embs = (all_embs[f"ID:{v}"] for v in uncanon_vars)
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved_ft")


# ---- varclr/utils/options.py ----
def add_options(parser):
    """Register all dataset/model/training command-line options on parser."""
    # fmt: off
    # Dataset
    parser.add_argument("--train-data-file", default="cs-cs.var.tok.txt", help="training data")
    parser.add_argument("--valid-data-file", default=None, type=str, help="validation data")
    parser.add_argument("--test-data-files", default="varclr/benchmarks/idbench/small_pair_wise.csv,varclr/benchmarks/idbench/medium_pair_wise.csv,varclr/benchmarks/idbench/large_pair_wise.csv", help="test data")
    parser.add_argument("--zero-unk", default=1, type=int, help="whether to ignore unknown tokens")
    parser.add_argument("--ngrams", default=3, type=int, help="whether to use character n-grams")
parser.add_argument("--tokenization", default="sp", type=str, choices=["sp", "ngrams"], help="which tokenization to use") 10 | parser.add_argument("--sp-model", default="microsoft/codebert-base-mlm", help="SP model to load for evaluation") 11 | parser.add_argument("--vocab-path", default="cs-cs.var.tok.txt.codebert.vocab", type=str, help="Path to vocabulary") 12 | parser.add_argument("--num-workers", default=4, type=int, help="Path to vocabulary") 13 | 14 | # Model 15 | parser.add_argument("--model", default="avg", choices=["avg", "lstm", "attn", "bert"], help="type of base model to train.") 16 | parser.add_argument("--bert-model", default="microsoft/codebert-base-mlm", help="type of bert model to load.") 17 | parser.add_argument("--dim", default=768, type=int, help="dimension of input embeddings") 18 | parser.add_argument("--hidden-dim", default=150, type=int, help="hidden dim size of LSTM") 19 | parser.add_argument("--scramble-rate", default=0, type=float, help="rate of scrambling in for LSTM") 20 | parser.add_argument("--delta", default=0.4, type=float, help="margin size for margin ranking loss") 21 | parser.add_argument("--nce-t", default=0.05, type=float, help="temperature for noise contrastive estimation loss") 22 | parser.add_argument("--temperature", default=100, type=float, help="temperature for biattn scorer") 23 | parser.add_argument("--last-n-layer-output", default=1, type=int, help="last layer representation used as output") 24 | 25 | # Training 26 | parser.add_argument("--name", default="Ours-FT", help="method name") 27 | parser.add_argument("--gpu", default=1, type=int, help="whether to train on gpu") 28 | parser.add_argument("--grad-clip", default=1., type=float, help='clip threshold of gradients') 29 | parser.add_argument("--epochs", default=300, type=int, help="number of epochs to train") 30 | parser.add_argument("--limit-train-batches", default=1.0, type=float, help="number of batches for each training epoch") 31 | 
parser.add_argument("--patience", default=40, type=int, help="early stopping patience") 32 | parser.add_argument("--lr", default=0.001, type=float, help="learning rate") 33 | parser.add_argument("--dropout", default=0.5, type=float, help="dropout rate") 34 | parser.add_argument("--batch-size", default=1024, type=int, help="size of batches") 35 | parser.add_argument("--load-file", help="filename to load a pretrained model.") 36 | parser.add_argument("--test", action="store_true", help="only do evaluation") 37 | parser.add_argument("--train-percent", default=1.0, type=float, help="percentage of data used for training") 38 | parser.add_argument("--seed", default=42, type=int) 39 | # fmt: on 40 | -------------------------------------------------------------------------------- /varclr/utils/similarity_search.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | import torch 5 | 6 | from varclr.utils.infer import MockArgs 7 | from varclr.data.preprocessor import CodePreprocessor 8 | 9 | if __name__ == "__main__": 10 | ret = torch.load(sys.argv[2]) 11 | vars, embs = ret["vars"], ret["embs"] 12 | embs /= embs.norm(dim=1, keepdim=True) 13 | embs = embs.cuda() 14 | var2idx = dict([(var, idx) for idx, var in enumerate(vars)]) 15 | processor = CodePreprocessor(MockArgs()) 16 | Ks = [1, 5, 10, 25, 50, 100, 250, 500, 1000] 17 | topk_succ = defaultdict(int) 18 | tot = 0 19 | with open(sys.argv[1], "r") as f: 20 | for line in f: 21 | try: 22 | var1, var2 = line.strip().split() 23 | except ValueError: 24 | print("skpped: ", line) 25 | 26 | def canon(var): 27 | return "".join( 28 | [ 29 | word.capitalize() if idx > 0 else word 30 | for idx, word in enumerate(processor(var).split()) 31 | ] 32 | ) 33 | 34 | var1, var2 = canon(var1), canon(var2) 35 | if var1 not in var2idx or var2 not in var2idx: 36 | print(f"variable {var1} or {var2} not found") 37 | continue 38 | tot += 1 39 | for k in Ks: 
40 | result = torch.topk(embs @ embs[var2idx[var1]], k=k + 1) 41 | topk_succ[k] += var2 in [vars[idx] for idx in result.indices][1:] 42 | 43 | print(f"Total {tot} variable pairs") 44 | for k in Ks: 45 | print(f"Recall@{k} = {100 * topk_succ[k] / tot:.1f}") 46 | --------------------------------------------------------------------------------