├── .github
└── workflows
│ ├── install.yml
│ ├── pretrain.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── cs-cs.var.tok.txt
├── cs-cs.var.tok.txt.codebert.vocab
├── docs
└── _static
│ └── images
│ ├── squareslab.png
│ ├── strudel.png
│ ├── training.jpeg
│ └── training.png
├── setup.py
├── tests
└── reproduce
│ └── test_idbench.py
└── varclr
├── __init__.py
├── benchmarks
├── __init__.py
├── benchmark.py
└── idbench
│ ├── large_pair_wise.csv
│ ├── medium_pair_wise.csv
│ └── small_pair_wise.csv
├── data
├── __init__.py
├── dataset.py
├── preprocessor.py
└── vocab.py
├── models
├── __init__.py
├── encoders.py
├── loss.py
├── model.py
├── tokenizers.py
└── urls_pretrained_model.py
├── pretrain.py
└── utils
├── find_nn.py
├── gen_typos.py
├── infer.py
├── infer_avg.py
├── infer_ft_cbow.py
├── options.py
└── similarity_search.py
/.github/workflows/install.yml:
--------------------------------------------------------------------------------
1 | name: Install
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | workflow_dispatch:
10 |
11 | jobs:
12 | install:
13 | # The type of runner that the job will run on
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ubuntu-20.04]
19 | python-version: [3.6, 3.9]
20 |
21 | steps:
22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
23 | - uses: actions/checkout@v2
24 | - uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Upgrade pip
29 | run: python -m pip install --upgrade pip setuptools wheel
30 | - name: Install dependencies
31 | run: pip install -e .
32 |
--------------------------------------------------------------------------------
/.github/workflows/pretrain.yml:
--------------------------------------------------------------------------------
1 | name: Pretrain
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | workflow_dispatch:
10 |
11 | jobs:
12 | pretrain:
13 | # The type of runner that the job will run on
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ubuntu-20.04]
19 | python-version: [3.6, 3.9]
20 |
21 | steps:
22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
23 | - uses: actions/checkout@v2
24 | - uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Upgrade pip
29 | run: python -m pip install --upgrade pip setuptools wheel
30 | - name: Install dependencies
31 | run: pip install -e .
32 | - name: Disable wandb
33 | run: wandb disabled
34 | - name: Try pretrain avg
35 | run: python -m varclr.pretrain --model avg --name varclr-avg --epochs 1 --limit-train-batches 0.1 --gpu 0
36 | - name: Try pretrain lstm
37 | run: python -m varclr.pretrain --model lstm --name varclr-lstm --epochs 1 --limit-train-batches 0.1 --gpu 0
38 | - name: Try pretrain bert
39 | run: python -m varclr.pretrain --model bert --name varclr-codebert --epochs 1 --limit-train-batches 0.01 --sp-model split --last-n-layer-output 4 --batch-size 64 --lr 1e-5 --gpu 0
40 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | workflow_dispatch:
10 |
11 | jobs:
12 | test:
13 | # The type of runner that the job will run on
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | os: [ubuntu-20.04]
19 | python-version: [3.6, 3.9]
20 |
21 | steps:
22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
23 | - uses: actions/checkout@v2
24 | - uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Upgrade pip
29 | run: python -m pip install --upgrade pip setuptools wheel
30 | - name: Install dependencies
31 | run: pip install -e .
32 | - name: Test reproduce paper results
33 | run: python -m pytest tests -v
34 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Pre-trained models
141 | saved/
142 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: end-of-file-fixer
8 | - id: check-yaml
9 | - repo: https://github.com/pycqa/isort
10 | rev: 5.8.0
11 | hooks:
12 | - id: isort
13 | name: isort (python)
14 | - id: isort
15 | name: isort (cython)
16 | types: [cython]
17 | - id: isort
18 | name: isort (pyi)
19 | types: [pyi]
20 | - repo: https://github.com/psf/black
21 | rev: 21.10b0
22 | hooks:
23 | - id: black
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Qibin Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 |

5 |
6 |
7 | ---
8 |
9 |
10 |
11 | [](https://github.com/squaresLab/VarCLR/actions/workflows/test.yml)
12 | [](https://github.com/squaresLab/VarCLR/stargazers)
13 | [](https://github.com/squaresLab/VarCLR/blob/master/LICENSE)
14 | [](https://github.com/ambv/black)
15 |
16 |
17 |
18 | ## [VarCLR: Variable Representation Pre-training via Contrastive Learning](https://arxiv.org/abs/2112.02650)
19 |
20 | **New:** Paper accepted by ICSE 2022. Preprint at [arXiv](https://arxiv.org/abs/2112.02650)!
21 |
22 | This repository contains code and pre-trained models for VarCLR, a contrastive learning based approach for learning semantic representations of variable names that effectively captures variable similarity, with state-of-the-art results on [IdBench@ICSE2021](https://conf.researchr.org/details/icse-2021/icse-2021-papers/3/IdBench-Evaluating-Semantic-Representations-of-Identifier-Names-in-Source-Code).
23 |
24 | - [VarCLR: Variable Representation Pre-training via Contrastive Learning](#varclr-variable-representation-pre-training-via-contrastive-learning)
25 | - [Step 0: Install](#step-0-install)
26 | - [Step 1: Load a Pre-trained VarCLR Model](#step-1-load-a-pre-trained-varclr-model)
27 | - [Step 2: VarCLR Variable Embeddings](#step-2-varclr-variable-embeddings)
28 | - [Get embedding of one variable](#get-embedding-of-one-variable)
29 | - [Get embeddings of list of variables (supports batching)](#get-embeddings-of-list-of-variables-supports-batching)
30 | - [Step 2: Get VarCLR Similarity Scores](#step-2-get-varclr-similarity-scores)
31 | - [Get similarity scores of N variable pairs](#get-similarity-scores-of-n-variable-pairs)
32 | - [Get pairwise (N * M) similarity scores from two lists of variables](#get-pairwise-n--m-similarity-scores-from-two-lists-of-variables)
33 | - [Step 3: Reproduce IdBench Benchmark Results](#step-3-reproduce-idbench-benchmark-results)
34 | - [Load the IdBench benchmark](#load-the-idbench-benchmark)
35 | - [Compute VarCLR scores and evaluate](#compute-varclr-scores-and-evaluate)
36 | - [Let's compare with the original CodeBERT](#lets-compare-with-the-original-codebert)
37 | - [Pre-train your own VarCLR models](#pre-train-your-own-varclr-models)
38 | - [Results on IdBench benchmarks](#results-on-idbench-benchmarks)
39 | - [Similarity](#similarity)
40 | - [Relatedness](#relatedness)
41 | - [Cite](#cite)
42 |
43 | ### Step 0: Install
44 |
45 | ```bash
46 | pip install -e .
47 | ```
48 |
49 | ### Step 1: Load a Pre-trained VarCLR Model
50 |
51 | ```python
52 | from varclr.models.model import Encoder
53 | model = Encoder.from_pretrained("varclr-codebert")
54 | ```
55 |
56 | ### Step 2: VarCLR Variable Embeddings
57 |
58 | #### Get embedding of one variable
59 |
60 | ```python
61 | emb = model.encode("squareslab")
62 | print(emb.shape)
63 | # torch.Size([1, 768])
64 | ```
65 |
66 | #### Get embeddings of list of variables (supports batching)
67 |
68 | ```python
69 | emb = model.encode(["squareslab", "strudel"])
70 | print(emb.shape)
71 | # torch.Size([2, 768])
72 | ```
73 |
74 | ### Step 2: Get VarCLR Similarity Scores
75 |
76 | #### Get similarity scores of N variable pairs
77 |
78 | ```python
79 | print(model.score("squareslab", "strudel"))
80 | # [0.42812108993530273]
81 | print(model.score(["squareslab", "average", "max", "max"], ["strudel", "mean", "min", "maximum"]))
82 | # [0.42812108993530273, 0.8849745988845825, 0.8035818338394165, 0.889922022819519]
83 | ```
84 |
85 | #### Get pairwise (N * M) similarity scores from two lists of variables
86 |
87 | ```python
88 | variable_list = ["squareslab", "strudel", "neulab"]
89 | print(model.cross_score("squareslab", variable_list))
90 | # [[1.0000007152557373, 0.4281214475631714, 0.7207341194152832]]
91 | print(model.cross_score(variable_list, variable_list))
92 | # [[1.0000007152557373, 0.4281214475631714, 0.7207341194152832],
93 | # [0.4281214475631714, 1.0000004768371582, 0.549992561340332],
94 | # [0.7207341194152832, 0.549992561340332, 1.000000238418579]]
95 | ```
96 |
97 | ### Step 3: Reproduce IdBench Benchmark Results
98 |
99 | #### Load the IdBench benchmark
100 |
101 | ```python
102 | from varclr.benchmarks import Benchmark
103 |
104 | # Similarity on IdBench-Medium
105 | b1 = Benchmark.build("idbench", variant="medium", metric="similarity")
106 | # Relatedness on IdBench-Large
107 | b2 = Benchmark.build("idbench", variant="large", metric="relatedness")
108 | ```
109 |
110 | #### Compute VarCLR scores and evaluate
111 |
112 | ```python
113 | id1_list, id2_list = b1.get_inputs()
114 | predicted = model.score(id1_list, id2_list)
115 | print(b1.evaluate(predicted))
116 | # {'spearmanr': 0.5248567181503295, 'pearsonr': 0.5249843473193132}
117 |
118 | print(b2.evaluate(model.score(*b2.get_inputs())))
119 | # {'spearmanr': 0.8012168379981921, 'pearsonr': 0.8021791703187449}
120 | ```
121 |
122 | #### Let's compare with the original [CodeBERT](https://github.com/microsoft/CodeBERT)
123 |
124 | ```python
125 | codebert = Encoder.from_pretrained("codebert")
126 | print(b1.evaluate(codebert.score(*b1.get_inputs())))
127 | # {'spearmanr': 0.2056582946575104, 'pearsonr': 0.1995058696927054}
128 | print(b2.evaluate(codebert.score(*b2.get_inputs())))
129 | # {'spearmanr': 0.3909218857993804, 'pearsonr': 0.3378219622284688}
130 | ```
131 |
132 | ### Pre-train your own VarCLR models
133 |
134 | You can pretrain your own copies of the VarCLR model variants with the following commands.
135 |
136 | ```bash
137 | python -m varclr.pretrain --model avg --name varclr-avg
138 | python -m varclr.pretrain --model lstm --name varclr-lstm
139 | python -m varclr.pretrain --model bert --name varclr-codebert --sp-model split --last-n-layer-output 4 --batch-size 64 --lr 1e-5 --epochs 1
140 | ```
141 |
142 | The training progress and test results will be presented in the wandb dashboard. For reference, our training curves look like the following:
143 |
144 | 
145 |
146 | ### Results on [IdBench](https://conf.researchr.org/details/icse-2021/icse-2021-papers/3/IdBench-Evaluating-Semantic-Representations-of-Identifier-Names-in-Source-Code) benchmarks
147 |
148 | #### Similarity
149 |
150 | | Method | Small | Medium | Large |
151 | | ---------------- | -------- | -------- | -------- |
152 | | FT-SG | 0.30 | 0.29 | 0.28 |
153 | | LV | 0.32 | 0.30 | 0.30 |
154 | | FT-cbow | 0.35 | 0.38 | 0.38 |
155 | | VarCLR-Avg | 0.47 | 0.45 | 0.44 |
156 | | VarCLR-LSTM | 0.50 | 0.49 | 0.49 |
157 | | VarCLR-CodeBERT | **0.53** | **0.53** | **0.51** |
158 | | | | | |
159 | | Combined-IdBench | 0.48 | 0.59 | 0.57 |
160 | | Combined-VarCLR | **0.66** | **0.65** | **0.62** |
161 |
162 | #### Relatedness
163 |
164 | | Method | Small | Medium | Large |
165 | | ---------------- | -------- | -------- | -------- |
166 | | LV | 0.48 | 0.47 | 0.48 |
167 | | FT-SG | 0.70 | 0.71 | 0.68 |
168 | | FT-cbow | 0.72 | 0.74 | 0.73 |
169 | | VarCLR-Avg | 0.67 | 0.66 | 0.66 |
170 | | VarCLR-LSTM | 0.71 | 0.70 | 0.69 |
171 | | VarCLR-CodeBERT | **0.79** | **0.79** | **0.80** |
172 | | | | | |
173 | | Combined-IdBench | 0.71 | 0.78 | 0.79 |
174 | | Combined-VarCLR | **0.79** | **0.81** | **0.85** |
175 |
176 | ### Cite
177 |
178 | If you find VarCLR useful in your research, please cite our ICSE 2022 paper:
179 |
180 | ```bibtex
181 | @inproceedings{ChenVarCLR2022,
182 | author = {Chen, Qibin and Lacomis, Jeremy and Schwartz, Edward J. and Neubig, Graham and Vasilescu, Bogdan and {Le~Goues}, Claire},
183 | title = {{VarCLR}: {Variable} Semantic Representation Pre-training via Contrastive Learning},
184 | booktitle = {International Conference on Software Engineering},
185 | year = {2022},
186 | series = {ICSE '22}
187 | }
188 | ```
189 |
--------------------------------------------------------------------------------
/cs-cs.var.tok.txt.codebert.vocab:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/cs-cs.var.tok.txt.codebert.vocab
--------------------------------------------------------------------------------
/docs/_static/images/squareslab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/squareslab.png
--------------------------------------------------------------------------------
/docs/_static/images/strudel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/strudel.png
--------------------------------------------------------------------------------
/docs/_static/images/training.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/training.jpeg
--------------------------------------------------------------------------------
/docs/_static/images/training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/docs/_static/images/training.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

# Packaging metadata for VarCLR.
# NOTE(review): developer tools (black, isort, pre-commit, pytest) are listed
# in install_requires rather than an extras_require group; the CI workflows
# rely on a plain `pip install -e .` pulling them in, so they are kept here
# deliberately — confirm before splitting them out.
setup(
    name="varclr",
    version="1.0",
    author="Qibin Chen",
    author_email="qibinc@andrew.cmu.edu",
    license="MIT",
    python_requires=">=3.6",
    packages=find_packages(exclude=[]),
    install_requires=[
        "black>=21.10b0",
        "gdown>=4.2.0",
        "isort>=5.8.0",
        "pandas>=1.1.0",
        "pre-commit>=2.15.0",
        "pytest>=6.2.4",
        # Pinned below 1.3: the pretraining code targets the pre-1.3
        # pytorch-lightning API.
        "pytorch-lightning>=1.0.8,<1.3",
        "sentencepiece>=0.1.95",
        "scipy>=1.5.2",
        "torch>=1.7.1",
        # Exact pin: model checkpoints were produced with this version.
        "transformers==4.5.1",
        "wandb>=0.12.6",
    ],
)
26 |
--------------------------------------------------------------------------------
/tests/reproduce/test_idbench.py:
--------------------------------------------------------------------------------
import numpy as np

from varclr.benchmarks import Benchmark
from varclr.models.model import Encoder


def _assert_reproduces_paper(model_name, paper_results):
    """Assert that a pretrained model reproduces the paper's IdBench scores.

    Args:
        model_name: name passed to ``Encoder.from_pretrained``.
        paper_results: maps ``(variant, metric)`` pairs to the spearman
            correlation reported in the VarCLR paper; each score must match
            to within 0.01 absolute tolerance.
    """
    model = Encoder.from_pretrained(model_name)
    for (variant, metric), expected in paper_results.items():
        benchmark = Benchmark.build("idbench", variant=variant, metric=metric)
        actual = benchmark.evaluate(model.score(*benchmark.get_inputs()))[
            "spearmanr"
        ]
        assert np.allclose(actual, expected, atol=1e-2)


def test_codebert():
    """VarCLR-CodeBERT reproduces the paper's IdBench results."""
    _assert_reproduces_paper(
        "varclr-codebert",
        {
            ("small", "similarity"): 0.53,
            ("medium", "similarity"): 0.53,
            ("large", "similarity"): 0.51,
            ("small", "relatedness"): 0.79,
            ("medium", "relatedness"): 0.79,
            ("large", "relatedness"): 0.80,
        },
    )


def test_avg():
    """VarCLR-Avg reproduces the paper's IdBench results."""
    _assert_reproduces_paper(
        "varclr-avg",
        {
            ("small", "similarity"): 0.47,
            ("medium", "similarity"): 0.45,
            ("large", "similarity"): 0.44,
            ("small", "relatedness"): 0.67,
            ("medium", "relatedness"): 0.66,
            ("large", "relatedness"): 0.66,
        },
    )


def test_lstm():
    """VarCLR-LSTM reproduces the paper's IdBench results."""
    _assert_reproduces_paper(
        "varclr-lstm",
        {
            ("small", "similarity"): 0.50,
            ("medium", "similarity"): 0.49,
            ("large", "similarity"): 0.49,
            ("small", "relatedness"): 0.71,
            ("medium", "relatedness"): 0.70,
            ("large", "relatedness"): 0.69,
        },
    )
56 |
--------------------------------------------------------------------------------
/varclr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/varclr/__init__.py
--------------------------------------------------------------------------------
/varclr/benchmarks/__init__.py:
--------------------------------------------------------------------------------
"""Public entry point for VarCLR evaluation benchmarks (e.g. IdBench)."""
from varclr.benchmarks.benchmark import Benchmark

__all__ = ["Benchmark"]
4 |
--------------------------------------------------------------------------------
/varclr/benchmarks/benchmark.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 | from typing import Dict, List, Tuple
4 |
5 | import pandas as pd
6 | from scipy.stats import pearsonr, spearmanr
7 |
8 |
class Benchmark:
    """Abstract interface for evaluation benchmarks.

    Concrete benchmarks are obtained by name through :meth:`build`;
    subclasses implement the input/label/evaluate contract below.
    """

    @staticmethod
    def build(benchmark: str, **kwargs):
        """Instantiate the benchmark registered under ``benchmark``.

        Raises ``KeyError`` if the name is unknown.
        """
        registry = {"idbench": IdBench}
        benchmark_cls = registry[benchmark]
        return benchmark_cls(**kwargs)

    def get_inputs(self):
        """Return the model inputs for this benchmark."""
        raise NotImplementedError

    def get_labels(self):
        """Return the gold labels for this benchmark."""
        raise NotImplementedError

    def evaluate(self, predictions) -> Dict[str, float]:
        """Score ``predictions`` against the gold labels."""
        raise NotImplementedError
22 |
23 |
class IdBench(Benchmark):
    """IdBench (ICSE 2021): human ratings of identifier-pair similarity/relatedness."""

    # Baseline score columns a pair must also have for it to be kept in the
    # evaluation set.
    BASELINES = ["FT-cbow", "FT-SG", "w2v-SG", "w2v-cbow", "Path-based"]

    def __init__(self, variant: str, metric: str) -> None:
        """Load one IdBench split from the bundled CSV.

        Args:
            variant: dataset size, one of "small", "medium", "large".
            metric: target rating, "similarity" or "relatedness".
        """
        super().__init__()
        assert variant in {"small", "medium", "large"}
        assert metric in {"similarity", "relatedness"}
        self.variant = variant
        self.metric = metric

        # The per-variant CSVs ship with the package, next to this module.
        pairs = pd.read_csv(
            os.path.join(
                pathlib.Path(__file__).parent.resolve(),
                f"idbench/{self.variant}_pair_wise.csv",
            ),
        )

        # Intended to drop rows whose target metric or any baseline score is
        # missing ("NAN" in the raw CSV).
        # NOTE(review): pandas parses the literal string "NAN" as float NaN by
        # default, and `nan != "NAN"` evaluates True, so this filter may be a
        # no-op as written; the reproduced paper numbers were obtained with
        # exactly this behavior — confirm before changing it.
        df = pairs[
            pairs.apply(
                lambda r: r[self.metric] != "NAN"
                and all(r[b] != "NAN" for b in IdBench.BASELINES),
                axis=1,
            )
        ]
        self.varlist1 = df["id1"].tolist()
        self.varlist2 = df["id2"].tolist()
        # Gold ratings for the chosen metric, coerced to floats.
        self.scores = df[self.metric].astype(float).tolist()

    def get_inputs(self) -> Tuple[List[str], List[str]]:
        """Return the two identifier lists to be scored pairwise."""
        return self.varlist1, self.varlist2

    def get_labels(self) -> List[float]:
        """Return the human gold scores for the selected metric."""
        return self.scores

    def evaluate(self, predictions) -> Dict[str, float]:
        """Return spearman and pearson correlations of ``predictions`` with gold."""
        return {
            "spearmanr": spearmanr(predictions, self.scores).correlation,
            "pearsonr": pearsonr(predictions, self.scores)[0],
        }
64 |
--------------------------------------------------------------------------------
/varclr/benchmarks/idbench/large_pair_wise.csv:
--------------------------------------------------------------------------------
1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW
2 | i,targ,0.3125,0.1718,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12
3 | expr,hint,0.1457,0.2155,NAN,0.11,0.42,0.21,0.25,0.36,0.0,0.5
4 | canvas,video,0.4167,0.7168,0.8173,0.48,0.57,0.49,0.24,0.78,0.0,0.42
5 | idx,indx,0.8542,0.9129,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75
6 | idx,ridx,0.4167,0.6948,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75
7 | right,bottom,0.078,0.7547,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42
8 | count,total,0.8088,0.8308,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6
9 | click,mousedown,0.7368,0.9448,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28
10 | change,keyup,0.3542,0.4987,NAN,0.55,0.65,0.54,0.55,0.81,0.0,0.42
11 | change,submit,0.327,0.4367,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5
12 | files,players,0.1168,0.1457,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5
13 | focus,resize,0.1332,0.3724,NAN,0.3,0.62,0.6,0.38,0.8,0.0,0.42
14 | reset,refresh,0.639,0.8112,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57
15 | pushStackLiteral,oldSelection,0.05,0.1807,0.231,0.13,0.27,0.32,0.31,0.55,0.12,0.44
16 | onAdd,onRemove,0.0192,0.9396,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44
17 | black,colours,0.077,0.8593,0.242,0.42,0.74,0.72,-0.12,0.53,0.14,0.43
18 | maroon,darkred,0.7115,0.819,NAN,0.45,0.73,0.71,0.15,0.84,0.29,0.57
19 | cosφ0,cosφ,0.625,0.891,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8
20 | allocate,contextmenu,0.1138,0.1441,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45
21 | response,alert,0.3087,0.6002,0.5999,0.33,0.53,0.46,0.14,0.31,0.0,0.31
22 | filename,fullname,0.1785,0.2903,NAN,0.65,0.65,0.7,0.11,0.72,0.75,0.88
23 | objects,records,0.6428,0.6825,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57
24 | ln,ilen,0.6965,0.7571,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5
25 | tasks,todos,0.9525,0.9378,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7
26 | images,authors,0.0418,0.2445,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5
27 | editable,dropdown,0.0588,0.2769,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5
28 | sources,adapters,0.375,0.5703,0.841,0.34,0.46,0.3,0.23,0.69,0.25,0.56
29 | ReactDOMComponent,ReactTextComponent,0.544,0.7231,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86
30 | λ0,φ0,0.2668,0.6339,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75
31 | xMin,xMax,0.0207,0.9783,0.3719,0.9,0.95,0.97,0.38,0.9,0.5,0.75
32 | FunctionExpression,FunctionDeclaration,0.2885,0.819,NAN,0.88,0.82,0.9,0.37,0.96,0.58,0.76
33 | Lines,CurRange,0.3077,0.3763,0.7382,0.58,0.68,0.87,0.46,0.39,0.12,0.38
34 | foundMap,foundStarMap,0.6923,0.819,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67
35 | columns,cols,0.9583,0.9419,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57
36 | dm,_queueHooks,0.15,0.198,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09
37 | fuchsia,pink,0.7345,0.8692,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29
38 | maxLine,maxLineLength,0.5833,0.8039,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54
39 | ExpressionStatement,FunctionDeclaration,0.3157,0.6423,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53
40 | addCls,removeCls,0.1457,0.8256,NAN,0.85,0.92,0.96,0.71,0.96,0.33,0.5
41 | object2,overlapOnly,0.0588,0.0154,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36
42 | nameSegment,foundStarMap,0.125,0.0411,NAN,0.37,0.6,0.89,0.8,0.76,0.08,0.5
43 | _selection,_sel,0.9808,0.9799,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4
44 | alignCenter,alignMiddle,0.625,0.8692,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73
45 | alignTop,popupLocationBar,0.0973,0.2592,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34
46 | targetFrame,targetFrameName,0.8035,0.8878,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73
47 | angle,radians,0.5312,0.8692,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43
48 | miny,ymin,0.9615,0.7989,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62
49 | element,dropdown,0.2917,0.477,0.7578,0.41,0.56,0.52,0.47,0.7,0.0,0.44
50 | equal,eql,0.9723,0.971,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6
51 | item,entry,0.7763,0.7798,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5
52 | events,rchecked,0.1427,0.1595,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44
53 | image,polyline,0.2223,0.3609,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38
54 | img,thumb,0.6528,0.7531,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4
55 | player,peer,0.45,0.5991,0.966,0.53,0.64,0.51,0.23,0.68,0.5,0.58
56 | files,profiles,0.2668,0.3549,0.5678,0.58,0.6,0.42,-0.01,0.77,0.62,0.62
57 | reset,clear,0.897,0.9077,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5
58 | username,userid,0.603,0.954,0.8685,0.72,0.79,0.67,0.15,0.65,0.5,0.62
59 | clear,refresh,0.4738,0.711,0.9717,0.55,0.66,0.61,0.35,0.81,0.14,0.43
60 | disabled,Tracker,0.05,0.1457,NAN,0.16,0.36,0.19,0.04,0.28,0.25,0.56
61 | olive,darkred,0.0938,0.6077,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43
62 | selectAnchor,anchorName,0.422,0.7385,0.9745,0.6,0.73,0.77,0.43,0.99,0.08,0.46
63 | names,filenames,0.6562,0.8039,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56
64 | setInterval,clearInterval,0.092,0.8624,0.3411,0.89,0.9,0.96,0.53,0.9,0.69,0.77
65 | getRules,foldingRules,0.375,0.6224,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54
66 | self_msgs,they_effects,0.0578,0.0748,0.2684,0.5,0.83,0.9,0.3,0.89,0.25,0.5
67 | getInstanceProp,setInstanceProp,0.0,0.8954,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97
68 | emptyText,blankText,0.8678,0.9077,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72
69 | minText,maxText,0.0735,0.9077,0.1279,0.95,0.9,0.93,0.65,0.99,0.71,0.86
70 | maxText,disabledDaysText,0.0167,0.0586,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38
71 | disabledDaysText,disabledDatesText,0.3905,0.8692,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91
72 | keywordMapper,buildinConstants,0.1332,0.1284,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41
73 | VM,invokePartial,0.0695,0.1284,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08
74 | blendMode,currentBlendMode,0.75,0.9038,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53
75 | touchmove,touchend,0.2778,0.8256,0.6156,0.91,0.94,0.96,0.27,0.97,0.56,0.72
76 | bindBuffer,ARRAY_BUFFER,0.3472,0.6658,0.625,0.51,0.81,0.96,0.72,0.65,0.08,0.46
77 | traverseContext,mapResult,0.3958,0.4553,NAN,0.36,0.58,0.84,0.25,0.68,0.2,0.4
78 | _owner,nextElement,0.0358,0.1595,0.1838,0.28,0.59,0.84,0.34,0.36,0.09,0.32
79 | m21,m22,0.1345,0.7385,NAN,0.98,0.96,0.95,0.48,0.94,0.67,0.83
80 | child,face,0.15,0.4247,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4
81 | displayMsg,emptyMsg,0.294,0.6925,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65
82 | pseudoElements,pseudoClasses,0.421,0.7246,0.851,0.71,0.75,0.78,0.74,0.99,0.57,0.75
83 | lastName,firstName,0.075,0.8954,0.3464,0.9,0.94,0.97,0.41,0.9,0.67,0.78
84 | Int16Array,Uint16Array,0.4625,0.9215,0.9195,0.91,0.91,0.94,0.18,0.83,0.82,0.86
85 | startSymbol,endSymbol,0.0375,0.9738,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68
86 | decrypt,ciphertext,0.3625,0.8039,0.8173,0.63,0.77,0.86,0.19,0.58,0.2,0.45
87 | rlocalProtocol,rurl,0.2738,0.5392,0.8272,0.66,0.6,0.67,0.47,0.91,0.21,0.25
88 | hSpace,popupFeatures,0.0375,0.1501,0.0,0.25,0.48,0.6,0.49,0.86,0.23,0.35
89 | linkTab,alignMiddle,0.1053,0.188,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45
90 | lockRatio,alignRight,0.0528,0.1467,0.231,0.47,0.45,0.57,0.13,0.74,0.2,0.55
91 | substr,substring,1.0,0.9419,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67
92 | columns,menus,0.327,0.5173,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5
93 | history,$ERROR,0.0155,0.1663,NAN,0.16,0.33,0.14,0.14,NAN,0.0,0.43
94 | deltaX,deltaY,0.2668,0.9129,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92
95 | MINUTE,SECOND,0.1405,0.9508,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5
96 | onDragStart,onDragEnd,0.1168,0.9825,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68
97 | body,agg,0.0938,0.1174,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38
98 | rows,pages,0.1833,0.5991,NAN,0.41,0.62,0.49,0.31,0.81,0.2,0.5
99 | store,storage,0.8333,0.9302,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71
100 | angle,theta,0.625,0.7571,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5
101 | foo,bar,0.5178,0.6077,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5
102 | DATE,MONTH,0.2308,0.9799,0.5943,0.65,0.75,0.83,0.34,0.89,0.2,0.5
103 | components,requests,0.0715,0.3277,NAN,0.34,0.44,0.26,0.22,0.77,0.3,0.55
104 | modal,calendar,0.1095,0.1663,NAN,0.27,0.44,0.33,0.16,0.6,0.25,0.44
105 | ids,tasks,0.3333,0.4595,NAN,0.38,0.58,0.31,0.26,0.8,0.2,0.4
106 | toolbar,preview,0.0333,0.0934,NAN,0.35,0.58,0.77,0.4,0.78,0.0,0.5
107 | $behaviour,foldingRules,0.1972,0.2568,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46
108 | a01,b01,0.4062,0.6893,NAN,0.85,0.94,0.9,0.44,0.94,0.67,0.83
109 | material,light,0.0657,0.2843,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31
110 | camera,texture,0.0832,0.2735,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5
111 | user,person,0.8947,0.8624,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42
112 | selected,active,0.7763,0.8486,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44
113 | rows,columns,0.0832,0.8839,0.2155,0.72,0.78,0.81,0.34,0.86,0.29,0.43
114 | files,problems,0.0555,0.0557,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5
115 | frames,markers,0.25,0.5497,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57
116 | objects,shortcuts,0.111,0.1428,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56
117 | cx,sx,0.3678,0.3847,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75
118 | tr,td,0.1,0.7385,NAN,0.81,0.82,0.8,0.56,0.77,0.5,0.75
119 | foo,trow,0.2678,0.1221,0.7743,0.22,0.62,-0.02,0.17,0.35,0.25,0.5
120 | bindBuffer,ELEMENT_ARRAY_BUFFER,0.2648,0.5078,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28
121 | navy,purple,0.2333,0.7037,0.5563,0.37,0.84,0.9,0.42,0.95,0.0,0.33
122 | host,author,0.5683,0.5722,NAN,0.38,0.54,0.21,0.13,0.57,0.17,0.33
123 | li,span,0.159,0.2869,NAN,0.53,0.73,0.59,0.37,0.82,0.0,0.25
124 | orange,pink,0.1345,0.9195,NAN,0.21,0.73,0.75,0.49,0.88,0.17,0.42
125 | CallExpression,BlockStatement,0.2142,0.4022,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5
126 | g,r,0.423,0.3363,0.8674,0.53,0.84,0.75,0.68,0.71,0.0,0.5
127 | data,azimuthal,0.206,0.154,0.401,0.28,0.45,0.14,0.02,0.44,0.22,0.33
128 | raw,movie,0.1527,0.3316,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3
129 | me,br,0.1322,0.1232,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5
130 | err,er,0.7115,0.7586,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67
131 | res,resp,0.6042,0.6077,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75
132 | content,newtext,0.3393,0.477,0.9284,0.35,0.57,0.34,0.22,0.51,0.43,0.71
133 | utils,util,0.8088,0.9846,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8
134 | renderer,screen,0.423,0.6177,0.8453,0.28,0.46,0.35,0.19,0.64,0.25,0.5
135 | maroon,olive,0.0535,0.5703,NAN,0.31,0.82,0.95,0.64,0.9,0.0,0.42
136 | olive,pink,0.044,0.5693,NAN,0.31,0.78,0.87,0.54,0.88,0.2,0.5
137 | λ0,λ1,0.2668,0.7385,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75
138 | arrayClass,boolClass,0.2353,0.677,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7
139 | paddingRight,paddingTop,0.0455,1.0,NAN,0.89,0.88,0.88,0.28,0.94,0.58,0.71
140 | expect,bp,0.05,0.0238,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25
141 | items,files,0.625,0.5892,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6
142 | disabled,visible,0.103,0.5384,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75
143 | round,sqrt,0.0578,0.5173,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4
144 | teal,lightgrey,0.25,0.7845,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28
145 | navy,lightblue,0.4117,0.8923,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22
146 | navy,lightgreen,0.125,0.6514,NAN,0.31,0.75,0.89,0.47,0.94,0.0,0.2
147 | navy,magenta,0.1765,0.8617,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36
148 | items,records,0.7345,0.8039,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43
149 | hide,blur,0.2708,0.5424,NAN,0.5,0.71,0.59,0.39,0.69,0.0,0.5
150 | VERSION,geoJson,0.0795,0.1085,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5
151 | Util,isParam,0.1,0.2155,NAN,0.27,0.52,0.29,0.35,0.4,0.0,0.29
152 | found,rawFunc,0.05,0.0934,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5
153 | a01,b11,0.175,0.4247,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67
154 | gray,silver,0.4667,0.9129,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33
155 | topLevelTarget,topLevelTargetID,0.6538,0.8392,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88
156 | defHeaders,defHeaderName,0.5312,0.9184,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73
157 | get,facets,0.1875,0.2155,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42
158 | equal,cut,0.0385,0.0947,0.2049,0.09,0.51,0.09,0.15,0.34,0.2,0.4
159 | start,begin,0.9423,0.9799,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5
160 | visible,showing,0.829,0.9038,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5
161 | dataMax,dataMin,0.125,0.9738,NAN,0.94,0.96,0.97,0.6,0.97,0.71,0.86
162 | len,ls,0.1965,0.3088,NAN,0.49,0.71,0.52,0.4,0.46,0.33,0.5
163 | items,ranges,0.25,0.3562,0.7983,0.24,0.55,0.38,0.18,0.78,0.17,0.5
164 | dispatchIDs,_dispatchIDs,0.828,0.9184,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92
165 | maroon,lightblue,0.0155,0.6731,0.1838,0.45,0.77,0.78,0.37,0.88,0.0,0.33
166 | remove,focus,0.0395,0.0502,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42
167 | b01,a03,0.1537,0.4569,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67
168 | getAnimation,depot,0.0735,0.0309,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29
169 | translateX,translateY,0.2293,0.9563,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95
170 | mountDepth,updateComponent,0.0625,0.063,0.2915,0.38,0.5,0.85,0.31,0.48,0.13,0.4
171 | left,bottom,0.125,0.6862,NAN,0.63,0.82,0.84,0.73,0.91,0.17,0.42
172 | nodes,filenames,0.2223,0.448,NAN,0.39,0.46,0.39,-0.08,0.63,0.33,0.44
173 | records,entries,0.8685,0.8624,1.0,0.41,0.53,0.52,0.14,0.76,0.29,0.57
174 | adjusted_scale,rangy,0.3077,0.4367,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18
175 | PLACEHOLDER,vertexFormat,0.1168,0.1807,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46
176 | vSpace,advisoryTitleInputLabel,0.0695,0.0557,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22
177 | styleSelectLabel,tag_h4,0.2812,0.4116,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22
178 | body,currants,0.1155,0.1953,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25
179 | alpha,rate,0.1138,0.1917,NAN,0.37,0.59,0.49,0.27,0.71,0.0,0.4
180 | indices,positions,0.6965,0.8133,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5
181 | keyup,deferId,0.1168,0.1109,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43
182 | getBorderWidth,getPadding,0.5295,0.8462,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54
183 | foo,abc,0.5833,0.2155,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5
184 | files,images,0.5,0.7385,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58
185 | layers,entries,0.2668,0.3724,0.7488,0.38,0.5,0.44,0.22,0.67,0.14,0.5
186 | generalTab,advancedTab,0.389,0.8839,0.0,0.63,0.76,0.91,0.77,0.98,0.27,0.59
187 | targetPopup,popupResizable,0.328,0.6569,0.6966,0.51,0.63,0.85,0.45,NAN,0.07,0.43
188 | start,searches,0.1168,0.1284,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5
189 | a24,a30,0.3462,0.7385,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67
190 | push,configure,0.047,0.1501,NAN,0.06,0.49,0.17,0.14,0.55,0.11,0.28
191 | len,ln,0.794,0.8462,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67
192 | left,top,0.05,0.7646,NAN,0.8,0.88,0.91,0.81,0.98,0.0,0.38
193 | lightgreen,lightgrey,0.1138,0.6433,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85
194 | self,that,0.297,0.4932,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5
195 | y,z,0.4265,0.477,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5
196 | push,ts,0.111,0.1138,NAN,0.12,0.56,0.29,0.01,0.24,0.25,0.38
197 | element,elm,0.8678,0.9231,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43
198 | id,userid,0.7648,0.8308,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33
199 | id,sessionid,0.7205,0.8462,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22
200 | expect,assume,0.6765,0.7539,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5
201 | callback,cb,0.8595,0.8692,NAN,0.65,0.7,0.78,0.48,0.83,0.25,0.25
202 | container,video,0.2082,0.3172,NAN,0.29,0.5,0.21,0.18,0.7,0.22,0.39
203 | container,submenu,0.3595,0.4278,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44
204 | children,columns,0.2833,0.3201,0.7983,0.42,0.54,0.46,0.2,0.73,0.25,0.56
205 | items,tiles,0.3125,0.4116,NAN,0.28,0.55,0.35,0.27,0.79,0.4,0.6
206 | equal,ok,0.3195,0.3753,0.9052,0.7,0.8,0.67,0.65,0.76,0.0,0.2
207 | miny,ypos,0.375,0.6825,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5
208 | newLength,Group,0.0418,0.0267,0.1606,0.15,0.41,0.1,0.13,0.32,0.0,0.28
209 | colspan,assignTo,0.0625,0.0847,0.29,0.1,0.39,0.21,0.22,0.08,0.12,0.44
210 | setScrollTop,prefixWith,0.0795,0.1323,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42
211 | getMinutes,getUTCMinutes,0.725,0.9085,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77
212 | FONTDATA,FONTS,0.578,0.853,0.8931,0.92,0.92,0.95,0.54,0.75,0.5,0.56
213 | ReactEmptyComponent,renderToStaticMarkup,0.1073,0.091,0.3695,0.6,0.65,0.65,0.22,0.55,0.1,0.52
214 | images,streams,0.25,0.586,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5
215 | s1,s2,0.3552,0.7246,0.7833,0.97,0.96,0.99,0.56,0.89,0.5,0.75
216 | precondition,prereq,0.8125,0.8692,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38
217 | setAttributeNode,createAttribute,0.3678,0.7076,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62
218 | Connection,Client,0.2812,0.4608,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45
219 | dom__FixedSizeListIterator,_dom_pos,0.2655,0.5586,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21
220 | v2,v3,0.297,0.8039,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75
221 | trim,separatedList,0.163,0.2837,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23
222 | minVal,minValue,0.908,0.9587,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75
223 | btnModify,btnSetValue,0.6072,0.7385,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55
224 | nodeLinkFn,childLinkFn,0.75,0.8593,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73
225 | setMinutes,setSeconds,0.2205,0.9077,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7
226 | obj1,obj2,0.6095,0.8039,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88
227 | destroy,clear,0.6,0.8083,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43
228 | panelTitle,panelTitle1,0.8815,0.9587,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91
229 | __dependency1__,__dependency2__,0.4643,0.9378,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97
230 | ace,__ace_shadowed__,0.5,0.7945,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19
231 | m1,m2,0.5417,0.8112,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75
232 | latitude,longitude,0.0147,0.9385,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78
233 | item,record,0.6765,0.7694,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33
234 | cols,rects,0.173,0.4166,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5
235 | resetSize,popupFeatures,0.0418,0.1428,0.231,0.13,0.49,0.58,0.29,0.75,0.15,0.42
236 | parentWindow,scanner,0.0735,0.0463,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38
237 | popupToolbar,popupDependent,0.25,0.6077,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61
238 | cursor,intercept,0.15,0.2503,NAN,0.2,0.35,0.23,0.08,0.33,0.11,0.39
239 | split,shim,0.264,0.3316,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6
240 | ranges,codes,0.1818,0.3818,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58
241 | prereq,fnExists,0.1668,0.1109,NAN,0.34,0.76,0.91,0.67,0.01,0.0,0.38
242 | styleSelectLabel,cssClassInputLabel,0.5625,0.6077,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67
243 | ELEMENT_NODE,TEXT_NODE,0.5148,0.7999,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67
244 | rlocalProtocol,rprotocol,0.7083,0.8839,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61
245 | listeners,cbs,0.1922,0.0947,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22
246 | ranges,expressions,0.175,0.3724,NAN,0.38,0.46,0.45,0.17,0.78,0.18,0.36
247 | segments,annotations,0.2655,0.3135,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45
248 | onPlay,onPause,0.1618,0.8768,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64
249 | clientY,_keyStr,0.1168,0.1109,NAN,0.2,0.41,0.21,0.07,0.4,0.14,0.57
250 | oMatchesSelector,msMatchesSelector,0.6562,0.8692,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91
251 | re,destruct,0.0207,0.063,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19
252 | click,dblclick,0.206,0.9691,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62
253 | columns,requests,0.125,0.1663,0.5999,0.24,0.49,0.31,0.17,0.75,0.25,0.56
254 | a11,b12,0.1945,0.6804,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67
255 | pink,brown,0.0938,0.6893,0.2204,0.38,0.74,0.82,0.63,0.91,0.0,0.4
256 | floor,abs,0.1427,0.3462,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3
257 | math,miny,0.0177,0.3836,0.1581,0.47,0.63,0.44,0.27,0.35,0.25,0.62
258 | alignTop,popupToolbar,0.147,0.2001,NAN,0.39,0.47,0.52,0.57,NAN,0.17,0.42
259 | oDomRef,structuralType,0.0938,0.0194,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25
260 | dataAndEvents,deepDataAndEvents,0.7368,0.8624,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76
261 | toggle,isLength,0.0147,0.0,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44
262 | defaults,DomRange,0.1332,0.2155,NAN,0.11,0.34,0.16,-0.07,0.44,0.0,0.5
263 | advice,alternative,0.0695,0.1284,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41
264 | controller,mutator,0.3125,0.3789,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45
265 | files,locations,0.4465,0.4396,0.8585,0.25,0.46,0.3,-0.04,0.82,0.22,0.39
266 | click,mouseup,0.4305,0.8692,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36
267 | inputs,resources,0.4167,0.6339,0.7932,0.29,0.45,0.32,-0.03,0.78,0.22,0.44
268 | styleSelectLabel,inlineStyleInputLabel,0.5625,0.6569,NAN,0.72,0.74,0.91,0.24,NAN,0.48,0.62
269 | a12,a13,0.5,0.8133,NAN,0.96,0.97,0.96,0.5,0.98,0.67,0.83
270 | abs,sqrt,0.0177,0.6825,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38
271 | cells,segments,0.5938,0.6893,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44
272 | place,written,0.047,0.1009,0.3411,0.15,0.41,0.2,0.11,0.42,0.14,0.43
273 | on,immediate,0.175,0.1893,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11
274 | links,associations,0.6667,0.8039,NAN,0.33,0.42,0.42,-0.01,0.72,0.17,0.29
275 | names,sources,0.578,0.6077,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5
276 | minx,ymax,0.1168,0.6862,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62
277 | cols,domains,0.25,0.4294,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43
278 | ELEMENT_ARRAY_BUFFER,bufferData,0.5312,0.8366,0.875,0.51,0.75,0.9,0.52,0.55,0.0,0.25
279 | equals,same,0.8333,0.9346,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42
280 | nextTick,notification,0.1345,0.2356,NAN,0.32,0.43,0.39,0.02,0.33,0.25,0.46
281 | cfg,conf,0.9445,0.8546,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5
282 | wrapper,scroller,0.3845,0.477,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62
283 | userA,userB,0.4605,0.9174,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9
284 | columns,datasets,0.1875,0.4278,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5
285 | limit,exponent,0.1537,0.3562,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38
286 | onTouchStart,onTouchMove,0.4545,0.7861,0.9397,0.88,0.92,0.95,0.33,0.96,0.58,0.75
287 | sender,timeEnd,0.0625,0.1064,NAN,0.12,0.45,0.34,-0.11,0.43,0.14,0.43
288 | scaleX,scaleY,0.1668,0.9783,NAN,0.97,0.97,0.99,0.61,0.95,0.83,0.92
289 | ui,secret,0.0735,0.0463,0.3489,0.05,0.5,0.09,0.33,0.54,0.0,0.17
290 | builtinConstants,buildinConstants,0.6315,0.8073,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97
291 | objects,images,NAN,NAN,0.8986,0.33,0.58,0.44,-0.21,0.73,0.14,0.5
292 | items,images,NAN,NAN,0.8436,0.39,0.6,0.4,-0.09,0.83,0.33,0.58
293 | items,links,NAN,NAN,0.7983,0.39,0.57,0.52,-0.06,0.75,0.2,0.6
294 | canvas,image,NAN,NAN,0.9397,0.47,0.68,0.6,0.59,0.71,0.0,0.42
295 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56
296 | margin,padding,NAN,NAN,0.9369,0.62,0.74,0.78,0.62,0.87,0.43,0.64
297 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64
298 |
--------------------------------------------------------------------------------
/varclr/benchmarks/idbench/medium_pair_wise.csv:
--------------------------------------------------------------------------------
1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW
2 | i,targ,0.341,0.1963,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12
3 | canvas,video,0.3638,0.6927,NAN,0.48,0.57,0.49,0.24,0.78,0.0,0.42
4 | idx,indx,0.9318,0.929,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75
5 | idx,ridx,0.4317,0.669,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75
6 | right,bottom,0.0832,0.7574,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42
7 | count,total,0.8125,0.8211,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6
8 | click,mousedown,0.7812,0.935,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28
9 | change,keyup,0.3542,0.5016,NAN,0.55,0.65,0.54,0.55,0.81,0.0,0.42
10 | change,submit,0.3542,0.48,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5
11 | files,players,0.125,0.1456,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5
12 | focus,resize,0.1457,0.3716,NAN,0.3,0.62,0.6,0.38,0.8,0.0,0.42
13 | reset,refresh,0.6178,0.8011,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57
14 | pushStackLiteral,oldSelection,0.0385,0.1599,0.231,0.13,0.27,0.32,0.31,0.55,0.12,0.44
15 | onAdd,onRemove,0.0207,0.9566,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44
16 | black,colours,0.0832,0.87,0.242,0.42,0.74,0.72,-0.12,0.53,0.14,0.43
17 | cosφ0,cosφ,0.591,0.8817,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8
18 | allocate,contextmenu,0.1,0.142,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45
19 | response,alert,0.2082,0.5884,NAN,0.33,0.53,0.46,0.14,0.31,0.0,0.31
20 | filename,fullname,0.15,0.246,NAN,0.65,0.65,0.7,0.11,0.72,0.75,0.88
21 | ln,ilen,0.6923,0.76,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5
22 | tasks,todos,0.9823,0.9444,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7
23 | images,authors,0.0167,0.3066,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5
24 | editable,dropdown,0.0578,0.3201,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5
25 | sources,adapters,0.4,0.532,NAN,0.34,0.46,0.3,0.23,0.69,0.25,0.56
26 | ReactDOMComponent,ReactTextComponent,0.5192,0.72,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86
27 | λ0,φ0,0.175,0.61,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75
28 | xMin,xMax,0.0227,0.9763,NAN,0.9,0.95,0.97,0.38,0.9,0.5,0.75
29 | FunctionExpression,FunctionDeclaration,0.3125,0.8484,NAN,0.88,0.82,0.9,0.37,0.96,0.58,0.76
30 | Lines,CurRange,0.325,0.428,NAN,0.58,0.68,0.87,0.46,0.39,0.12,0.38
31 | foundMap,foundStarMap,0.7083,0.8266,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67
32 | columns,cols,0.9667,0.9654,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57
33 | dm,_queueHooks,0.1138,0.149,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09
34 | fuchsia,pink,0.75,0.9134,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29
35 | maxLine,maxLineLength,0.5833,0.805,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54
36 | ExpressionStatement,FunctionDeclaration,0.3055,0.6534,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53
37 | addCls,removeCls,0.159,0.8346,NAN,0.85,0.92,0.96,0.71,0.96,0.33,0.5
38 | object2,overlapOnly,0.0667,0.0294,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36
39 | _selection,_sel,0.9772,0.9763,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4
40 | alignCenter,alignMiddle,0.659,0.8346,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73
41 | alignTop,popupLocationBar,0.125,0.2943,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34
42 | targetFrame,targetFrameName,0.8638,0.9054,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73
43 | angle,radians,0.6,0.922,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43
44 | miny,ymin,1.0,0.811,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62
45 | equal,eql,0.9823,0.9815,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6
46 | item,entry,0.8333,0.792,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5
47 | events,rchecked,0.1362,0.149,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44
48 | image,polyline,0.2205,0.3729,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38
49 | img,thumb,0.7,0.8094,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4
50 | player,peer,0.4545,0.669,NAN,0.53,0.64,0.51,0.23,0.68,0.5,0.58
51 | files,profiles,0.159,0.2436,NAN,0.58,0.6,0.42,-0.01,0.77,0.62,0.62
52 | reset,clear,0.8845,0.8999,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5
53 | username,userid,0.6,0.9654,0.8685,0.72,0.79,0.67,0.15,0.65,0.5,0.62
54 | clear,refresh,0.45,0.7054,NAN,0.55,0.66,0.61,0.35,0.81,0.14,0.43
55 | disabled,Tracker,0.025,0.194,NAN,0.16,0.36,0.19,0.04,0.28,0.25,0.56
56 | olive,darkred,0.0715,0.61,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43
57 | selectAnchor,anchorName,0.3125,0.675,NAN,0.6,0.73,0.77,0.43,0.99,0.08,0.46
58 | names,filenames,0.625,0.7956,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56
59 | setInterval,clearInterval,0.0715,0.9072,0.3411,0.89,0.9,0.96,0.53,0.9,0.69,0.77
60 | getRules,foldingRules,0.3845,0.6799,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54
61 | self_msgs,they_effects,0.0682,0.0546,0.2684,0.5,0.83,0.9,0.3,0.89,0.25,0.5
62 | getInstanceProp,setInstanceProp,0.0,0.9199,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97
63 | emptyText,blankText,0.8927,0.8885,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72
64 | minText,maxText,0.0,0.9628,0.1279,0.95,0.9,0.93,0.65,0.99,0.71,0.86
65 | maxText,disabledDaysText,0.0,0.0684,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38
66 | disabledDaysText,disabledDatesText,0.3638,0.8817,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91
67 | keywordMapper,buildinConstants,0.0625,0.09,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41
68 | VM,invokePartial,0.0333,0.1334,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08
69 | blendMode,currentBlendMode,0.75,0.9784,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53
70 | touchmove,touchend,0.2167,0.844,NAN,0.91,0.94,0.96,0.27,0.97,0.56,0.72
71 | bindBuffer,ARRAY_BUFFER,0.2812,0.6425,0.625,0.51,0.81,0.96,0.72,0.65,0.08,0.46
72 | traverseContext,mapResult,0.4,0.48,NAN,0.36,0.58,0.84,0.25,0.68,0.2,0.4
73 | _owner,nextElement,0.0227,0.0546,0.1838,0.28,0.59,0.84,0.34,0.36,0.09,0.32
74 | m21,m22,0.159,0.7637,NAN,0.98,0.96,0.95,0.48,0.94,0.67,0.83
75 | child,face,0.1073,0.3872,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4
76 | displayMsg,emptyMsg,0.1875,0.6966,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65
77 | pseudoElements,pseudoClasses,0.4545,0.811,NAN,0.71,0.75,0.78,0.74,0.99,0.57,0.75
78 | lastName,firstName,0.0555,0.8989,0.3464,0.9,0.94,0.97,0.41,0.9,0.67,0.78
79 | Int16Array,Uint16Array,0.4705,0.9542,NAN,0.91,0.91,0.94,0.18,0.83,0.82,0.86
80 | startSymbol,endSymbol,0.044,0.9847,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68
81 | decrypt,ciphertext,0.3685,0.7949,NAN,0.63,0.77,0.86,0.19,0.58,0.2,0.45
82 | rlocalProtocol,rurl,0.3027,0.5621,NAN,0.66,0.6,0.67,0.47,0.91,0.21,0.25
83 | hSpace,popupFeatures,0.0155,0.1225,0.0,0.25,0.48,0.6,0.49,0.86,0.23,0.35
84 | linkTab,alignMiddle,0.0892,0.1643,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45
85 | lockRatio,alignRight,0.0192,0.1599,0.231,0.47,0.45,0.57,0.13,0.74,0.2,0.55
86 | substr,substring,1.0,0.9306,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67
87 | columns,menus,0.3862,0.551,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5
88 | history,$ERROR,0.0207,0.1984,NAN,0.16,0.33,0.14,0.14,NAN,0.0,0.43
89 | deltaX,deltaY,0.2857,0.9072,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92
90 | MINUTE,SECOND,0.125,0.9566,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5
91 | onDragStart,onDragEnd,0.1345,0.98,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68
92 | body,agg,0.0832,0.1115,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38
93 | rows,pages,0.1668,0.5234,NAN,0.41,0.62,0.49,0.31,0.81,0.2,0.5
94 | store,storage,0.8333,0.9306,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71
95 | angle,theta,0.75,0.811,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5
96 | foo,bar,0.5208,0.6316,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5
97 | DATE,MONTH,0.225,1.0,0.5943,0.65,0.75,0.83,0.34,0.89,0.2,0.5
98 | modal,calendar,0.0207,0.025,NAN,0.27,0.44,0.33,0.16,0.6,0.25,0.44
99 | ids,tasks,0.325,0.48,NAN,0.38,0.58,0.31,0.26,0.8,0.2,0.4
100 | orange,pink,0.159,0.929,NAN,0.21,0.73,0.75,0.49,0.88,0.17,0.42
101 | CallExpression,BlockStatement,0.2045,0.409,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5
102 | data,azimuthal,0.173,0.1599,0.401,0.28,0.45,0.14,0.02,0.44,0.22,0.33
103 | raw,movie,0.1,0.3066,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3
104 | expect,bp,0.05,0.0294,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25
105 | utils,util,0.75,0.9763,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8
106 | items,files,0.6042,0.5884,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6
107 | disabled,visible,0.125,0.6284,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75
108 | round,sqrt,0.0418,0.545,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4
109 | teal,lightgrey,0.25,0.87,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28
110 | navy,lightblue,0.4167,0.9306,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22
111 | navy,lightgreen,0.125,0.74,NAN,0.31,0.75,0.89,0.47,0.94,0.0,0.2
112 | navy,magenta,0.2115,0.8401,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36
113 | equal,cut,0.0455,0.1253,0.2049,0.09,0.51,0.09,0.15,0.34,0.2,0.4
114 | start,begin,0.9375,0.9784,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5
115 | renderer,screen,0.475,0.662,NAN,0.28,0.46,0.35,0.19,0.64,0.25,0.5
116 | visible,showing,0.8382,0.9082,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5
117 | foo,trow,0.2082,0.0684,NAN,0.22,0.62,-0.02,0.17,0.35,0.25,0.5
118 | arrayClass,boolClass,0.1965,0.6472,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7
119 | me,br,0.159,0.1017,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5
120 | items,records,0.725,0.844,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43
121 | items,ranges,0.2045,0.3383,NAN,0.24,0.55,0.38,0.18,0.78,0.17,0.5
122 | bindBuffer,ELEMENT_ARRAY_BUFFER,0.25,0.48,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28
123 | dispatchIDs,_dispatchIDs,0.8215,0.9628,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92
124 | maroon,lightblue,0.0227,0.7163,0.1838,0.45,0.77,0.78,0.37,0.88,0.0,0.33
125 | remove,focus,0.0578,0.0601,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42
126 | getAnimation,depot,0.0385,0.04,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29
127 | nodes,filenames,0.2333,0.4974,NAN,0.39,0.46,0.39,-0.08,0.63,0.33,0.44
128 | cx,sx,0.3832,0.4106,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75
129 | records,entries,0.9062,0.8861,1.0,0.41,0.53,0.52,0.14,0.76,0.29,0.57
130 | adjusted_scale,rangy,0.3333,0.48,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18
131 | PLACEHOLDER,vertexFormat,0.075,0.09,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46
132 | vSpace,advisoryTitleInputLabel,0.0832,0.0814,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22
133 | λ0,λ1,0.2,0.74,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75
134 | start,searches,0.1155,0.1399,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5
135 | navy,purple,0.225,0.688,NAN,0.37,0.84,0.9,0.42,0.95,0.0,0.33
136 | olive,pink,0.0625,0.5234,NAN,0.31,0.78,0.87,0.54,0.88,0.2,0.5
137 | push,configure,0.0625,0.1334,NAN,0.06,0.49,0.17,0.14,0.55,0.11,0.28
138 | len,ln,0.8125,0.8539,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67
139 | left,top,0.05,0.766,NAN,0.8,0.88,0.91,0.81,0.98,0.0,0.38
140 | self,that,0.3035,0.4615,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5
141 | y,z,0.4423,0.5801,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5
142 | element,elm,0.8655,0.9399,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43
143 | id,userid,0.7885,0.8401,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33
144 | id,sessionid,0.7917,0.935,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22
145 | equal,ok,0.3333,0.3934,NAN,0.7,0.8,0.67,0.65,0.76,0.0,0.2
146 | container,video,0.2293,0.415,NAN,0.29,0.5,0.21,0.18,0.7,0.22,0.39
147 | miny,ypos,0.2955,0.6927,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5
148 | newLength,Group,0.0295,0.0211,0.1606,0.15,0.41,0.1,0.13,0.32,0.0,0.28
149 | colspan,assignTo,0.0555,0.0754,0.29,0.1,0.39,0.21,0.22,0.08,0.12,0.44
150 | setScrollTop,prefixWith,0.0832,0.1456,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42
151 | getMinutes,getUTCMinutes,0.75,0.9277,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77
152 | FONTDATA,FONTS,0.5667,0.8614,0.8931,0.92,0.92,0.95,0.54,0.75,0.5,0.56
153 | ReactEmptyComponent,renderToStaticMarkup,0.0973,0.09,NAN,0.6,0.65,0.65,0.22,0.55,0.1,0.52
154 | images,streams,0.25,0.636,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5
155 | VERSION,geoJson,0.075,0.09,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5
156 | s1,s2,0.422,0.74,0.7833,0.97,0.96,0.99,0.56,0.89,0.5,0.75
157 | precondition,prereq,0.85,0.9134,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38
158 | setAttributeNode,createAttribute,0.4,0.7746,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62
159 | Connection,Client,0.25,0.4399,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45
160 | dom__FixedSizeListIterator,_dom_pos,0.3077,0.5801,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21
161 | v2,v3,0.3077,0.8001,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75
162 | container,submenu,0.3333,0.4366,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44
163 | children,columns,0.341,0.409,NAN,0.42,0.54,0.46,0.2,0.73,0.25,0.56
164 | items,tiles,0.3542,0.4584,NAN,0.28,0.55,0.35,0.27,0.79,0.4,0.6
165 | trim,separatedList,0.1912,0.3271,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23
166 | minVal,minValue,0.95,0.948,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75
167 | push,ts,0.1043,0.1115,NAN,0.12,0.56,0.29,0.01,0.24,0.25,0.38
168 | expect,assume,0.7708,0.805,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5
169 | btnModify,btnSetValue,0.5962,0.72,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55
170 | nodeLinkFn,childLinkFn,0.75,0.8601,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73
171 | setMinutes,setSeconds,0.2167,0.896,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7
172 | obj1,obj2,0.65,0.792,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88
173 | destroy,clear,0.6138,0.7873,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43
174 | panelTitle,panelTitle1,0.9167,0.9654,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91
175 | __dependency1__,__dependency2__,0.4605,0.9316,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97
176 | ace,__ace_shadowed__,0.5,0.818,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19
177 | m1,m2,0.5735,0.8625,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75
178 | latitude,longitude,0.0167,0.9306,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78
179 | item,record,0.6667,0.7746,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33
180 | cols,rects,0.2045,0.4563,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5
181 | resetSize,popupFeatures,0.05,0.1854,0.231,0.13,0.49,0.58,0.29,0.75,0.15,0.42
182 | callback,cb,0.8655,0.8601,NAN,0.65,0.7,0.78,0.48,0.83,0.25,0.25
183 | parentWindow,scanner,0.0715,0.0343,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38
184 | targetPopup,popupResizable,0.2885,0.5801,NAN,0.51,0.63,0.85,0.45,NAN,0.07,0.43
185 | popupToolbar,popupDependent,0.1785,0.5915,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61
186 | cursor,intercept,0.05,0.22,NAN,0.2,0.35,0.23,0.08,0.33,0.11,0.39
187 | split,shim,0.2917,0.3934,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6
188 | ranges,codes,0.2,0.428,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58
189 | generalTab,advancedTab,0.375,0.87,0.0,0.63,0.76,0.91,0.77,0.98,0.27,0.59
190 | styleSelectLabel,cssClassInputLabel,0.6138,0.669,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67
191 | getBorderWidth,getPadding,0.4822,0.8328,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54
192 | ELEMENT_NODE,TEXT_NODE,0.5,0.805,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67
193 | rlocalProtocol,rprotocol,0.7,0.896,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61
194 | listeners,cbs,0.125,0.142,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22
195 | tr,td,0.0455,0.811,NAN,0.81,0.82,0.8,0.56,0.77,0.5,0.75
196 | user,person,0.8845,0.8401,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42
197 | onPlay,onPause,0.172,0.87,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64
198 | topLevelTarget,topLevelTargetID,0.625,0.87,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88
199 | clientY,_keyStr,0.0682,0.1017,NAN,0.2,0.41,0.21,0.07,0.4,0.14,0.57
200 | oMatchesSelector,msMatchesSelector,0.6833,0.8614,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91
201 | re,destruct,0.0207,0.0684,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19
202 | translateX,translateY,0.2045,0.9527,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95
203 | click,dblclick,0.2333,0.9654,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62
204 | columns,requests,0.1537,0.1599,NAN,0.24,0.49,0.31,0.17,0.75,0.25,0.56
205 | a11,b12,0.2142,0.6284,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67
206 | pink,brown,0.0358,0.7215,0.2204,0.38,0.74,0.82,0.63,0.91,0.0,0.4
207 | floor,abs,0.1138,0.4563,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3
208 | math,miny,0.0192,0.3599,0.1581,0.47,0.63,0.44,0.27,0.35,0.25,0.62
209 | toggle,isLength,0.0147,0.0057,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44
210 | paddingRight,paddingTop,0.05,1.0,NAN,0.89,0.88,0.88,0.28,0.94,0.58,0.71
211 | dataAndEvents,deepDataAndEvents,0.6875,0.8375,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76
212 | found,rawFunc,0.0385,0.0998,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5
213 | alpha,rate,0.125,0.22,NAN,0.37,0.59,0.49,0.27,0.71,0.0,0.4
214 | layers,entries,0.2955,0.4327,NAN,0.38,0.5,0.44,0.22,0.67,0.14,0.5
215 | keyup,deferId,0.1345,0.1399,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43
216 | material,light,0.0832,0.3414,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31
217 | controller,mutator,0.3125,0.3716,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45
218 | files,locations,0.4423,0.48,0.8585,0.25,0.46,0.3,-0.04,0.82,0.22,0.39
219 | files,problems,0.0625,0.0738,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5
220 | frames,markers,0.2188,0.5611,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57
221 | objects,shortcuts,0.1168,0.1505,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56
222 | a01,b11,0.1965,0.48,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67
223 | defHeaders,defHeaderName,0.5208,0.9566,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73
224 | click,mouseup,0.5168,0.9134,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36
225 | inputs,resources,0.4107,0.6284,0.7932,0.29,0.45,0.32,-0.03,0.78,0.22,0.44
226 | advice,alternative,0.0715,0.1271,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41
227 | body,currants,0.125,0.22,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25
228 | a12,a13,0.4167,0.7616,NAN,0.96,0.97,0.96,0.5,0.98,0.67,0.83
229 | abs,sqrt,0.0207,0.74,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38
230 | cells,segments,0.5893,0.7028,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44
231 | place,written,0.0227,0.1253,NAN,0.15,0.41,0.2,0.11,0.42,0.14,0.43
232 | names,sources,0.577,0.6001,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5
233 | Util,isParam,0.075,0.168,NAN,0.27,0.52,0.29,0.35,0.4,0.0,0.29
234 | minx,ymax,0.125,0.7028,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62
235 | segments,annotations,0.25,0.3128,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45
236 | cols,domains,0.25,0.402,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43
237 | ELEMENT_ARRAY_BUFFER,bufferData,0.4772,0.8583,0.875,0.51,0.75,0.9,0.52,0.55,0.0,0.25
238 | cfg,conf,0.9667,0.896,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5
239 | userA,userB,0.4822,0.9256,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9
240 | columns,datasets,0.1922,0.4199,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5
241 | limit,exponent,0.1818,0.3383,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38
242 | oDomRef,structuralType,0.077,0.0,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25
243 | camera,texture,0.0578,0.1999,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5
244 | selected,active,0.7322,0.87,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44
245 | rows,columns,0.0715,0.87,0.2155,0.72,0.78,0.81,0.34,0.86,0.29,0.43
246 | nextTick,notification,0.175,0.298,NAN,0.32,0.43,0.39,0.02,0.33,0.25,0.46
247 | ui,secret,0.0625,0.0684,NAN,0.05,0.5,0.09,0.33,0.54,0.0,0.17
248 | objects,records,NAN,NAN,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57
249 | objects,images,NAN,NAN,0.8986,0.33,0.58,0.44,-0.21,0.73,0.14,0.5
250 | items,images,NAN,NAN,0.8436,0.39,0.6,0.4,-0.09,0.83,0.33,0.58
251 | canvas,image,NAN,NAN,0.9397,0.47,0.68,0.6,0.59,0.71,0.0,0.42
252 | indices,positions,NAN,NAN,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5
253 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56
254 | margin,padding,NAN,NAN,0.9369,0.62,0.74,0.78,0.62,0.87,0.43,0.64
255 | $behaviour,foldingRules,NAN,NAN,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46
256 | foo,abc,NAN,NAN,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5
257 | g,r,NAN,NAN,0.8674,0.53,0.84,0.75,0.68,0.71,0.0,0.5
258 | err,er,NAN,NAN,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67
259 | res,resp,NAN,NAN,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75
260 | gray,silver,NAN,NAN,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33
261 | get,facets,NAN,NAN,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42
262 | files,images,NAN,NAN,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58
263 | b01,a03,NAN,NAN,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67
264 | styleSelectLabel,tag_h4,NAN,NAN,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22
265 | a24,a30,NAN,NAN,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67
266 | lightgreen,lightgrey,NAN,NAN,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85
267 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64
268 | on,immediate,NAN,NAN,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11
269 | equals,same,NAN,NAN,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42
270 | wrapper,scroller,NAN,NAN,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62
271 | builtinConstants,buildinConstants,NAN,NAN,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97
272 |
--------------------------------------------------------------------------------
/varclr/benchmarks/idbench/small_pair_wise.csv:
--------------------------------------------------------------------------------
1 | id1,id2,similarity,relatedness,contextual_similarity,FT-cbow,FT-SG,w2v-SG,w2v-cbow,Path-based,LV,NW
2 | response,alert,0.25,0.6348,NAN,0.33,0.53,0.46,0.14,0.31,0.0,0.31
3 | ln,ilen,0.7728,0.7866,0.8591,0.44,0.64,0.45,0.51,0.89,0.5,0.5
4 | tasks,todos,0.9772,0.9288,NAN,0.5,0.6,0.49,0.32,0.79,0.4,0.7
5 | images,authors,0.0207,0.3042,NAN,0.33,0.54,0.41,0.15,0.73,0.14,0.5
6 | editable,dropdown,0.0625,0.3261,NAN,0.3,0.51,0.48,0.23,0.73,0.0,0.5
7 | ReactDOMComponent,ReactTextComponent,0.4772,0.7154,0.8138,0.94,0.86,0.78,0.28,0.82,0.78,0.86
8 | foundMap,foundStarMap,0.7045,0.8341,NAN,0.91,0.86,0.92,0.46,0.89,0.67,0.67
9 | columns,cols,0.9667,0.9653,NAN,0.78,0.81,0.78,0.31,0.83,0.57,0.57
10 | dm,_queueHooks,0.1138,0.1461,NAN,0.0,0.34,0.18,0.17,0.3,0.0,0.09
11 | pushStackLiteral,oldSelection,0.05,0.113,NAN,0.13,0.27,0.32,0.31,0.55,0.12,0.44
12 | fuchsia,pink,0.775,0.9478,0.9877,0.28,0.77,0.92,0.48,0.95,0.0,0.29
13 | ExpressionStatement,FunctionDeclaration,0.2,0.6173,NAN,0.66,0.67,0.81,0.46,0.92,0.11,0.53
14 | object2,overlapOnly,0.077,0.0368,NAN,0.44,0.67,0.74,0.54,0.46,0.09,0.36
15 | _selection,_sel,0.9772,0.9763,NAN,0.81,0.82,0.69,0.43,0.45,0.4,0.4
16 | alignTop,popupLocationBar,0.1457,0.3042,NAN,0.39,0.49,0.51,0.59,NAN,0.19,0.34
17 | miny,ymin,1,0.7913,NAN,0.48,0.81,0.7,0.24,0.9,0.5,0.62
18 | equal,eql,1,1,0.983,0.7,0.79,0.73,0.73,0.72,0.6,0.6
19 | item,entry,0.775,0.7391,0.9186,0.4,0.63,0.46,0.13,0.84,0.2,0.5
20 | image,polyline,0.225,0.4521,0.0247,0.2,0.44,0.31,-0.0,0.75,0.12,0.38
21 | player,peer,0.4,0.6608,NAN,0.53,0.64,0.51,0.23,0.68,0.5,0.58
22 | files,profiles,0.175,0.2695,NAN,0.58,0.6,0.42,-0.01,0.77,0.62,0.62
23 | reset,clear,0.95,0.9478,0.945,0.52,0.69,0.63,0.34,0.93,0.0,0.5
24 | reset,refresh,0.6332,0.7913,0.9822,0.45,0.63,0.62,0.28,0.86,0.43,0.57
25 | username,userid,0.577,0.9799,NAN,0.72,0.79,0.67,0.15,0.65,0.5,0.62
26 | clear,refresh,0.4375,0.6955,NAN,0.55,0.66,0.61,0.35,0.81,0.14,0.43
27 | olive,darkred,0.077,0.5787,NAN,0.45,0.69,0.75,0.24,0.88,0.14,0.43
28 | selectAnchor,anchorName,0.35,0.7391,NAN,0.6,0.73,0.77,0.43,0.99,0.08,0.46
29 | names,filenames,0.625,0.7827,0.7983,0.62,0.64,0.5,0.1,0.78,0.56,0.56
30 | setInterval,clearInterval,0.0625,0.8912,NAN,0.89,0.9,0.96,0.53,0.9,0.69,0.77
31 | getRules,foldingRules,0.35,0.6608,0.0831,0.7,0.76,0.66,0.43,0.66,0.42,0.54
32 | self_msgs,they_effects,0.05,0.0347,NAN,0.5,0.83,0.9,0.3,0.89,0.25,0.5
33 | getInstanceProp,setInstanceProp,0,0.9478,0.0975,0.94,0.96,0.86,0.6,0.72,0.93,0.97
34 | emptyText,blankText,0.8958,0.9564,0.966,0.67,0.73,0.69,0.11,0.92,0.44,0.72
35 | minText,maxText,0,0.9564,NAN,0.95,0.9,0.93,0.65,0.99,0.71,0.86
36 | maxText,disabledDaysText,0,0.0277,NAN,0.66,0.72,0.87,0.51,0.93,0.31,0.38
37 | disabledDaysText,disabledDatesText,0.375,0.8956,0.6876,0.96,0.96,0.95,0.64,0.99,0.88,0.91
38 | keywordMapper,buildinConstants,0.05,0.0347,NAN,0.41,0.65,0.75,0.48,0.77,0.06,0.41
39 | VM,invokePartial,0.0358,0.1427,NAN,0.13,0.55,0.83,0.25,0.77,0.0,0.08
40 | blendMode,currentBlendMode,0.75,0.9739,0.9186,0.79,0.88,0.78,0.69,0.79,0.5,0.53
41 | touchmove,touchend,0.2323,0.8696,NAN,0.91,0.94,0.96,0.27,0.97,0.56,0.72
42 | bindBuffer,ARRAY_BUFFER,0.2833,0.6348,NAN,0.51,0.81,0.96,0.72,0.65,0.08,0.46
43 | click,mousedown,0.7857,0.9254,0.3079,0.57,0.69,0.76,0.52,0.91,0.0,0.28
44 | files,players,0.159,0.1698,NAN,0.26,0.51,0.32,0.01,0.8,0.29,0.5
45 | _owner,nextElement,0.025,0.0347,NAN,0.28,0.59,0.84,0.34,0.36,0.09,0.32
46 | child,face,0.125,0.3694,NAN,0.12,0.46,0.32,0.14,0.5,0.0,0.4
47 | displayMsg,emptyMsg,0.2045,0.6679,NAN,0.47,0.8,0.79,0.24,0.88,0.5,0.65
48 | pseudoElements,pseudoClasses,0.475,0.7913,NAN,0.71,0.75,0.78,0.74,0.99,0.57,0.75
49 | lastName,firstName,0.0625,0.9022,NAN,0.9,0.94,0.97,0.41,0.9,0.67,0.78
50 | Int16Array,Uint16Array,0.453,0.951,NAN,0.91,0.91,0.94,0.18,0.83,0.82,0.86
51 | startSymbol,endSymbol,0.047,0.9838,NAN,0.94,0.91,0.97,0.6,0.98,0.55,0.68
52 | decrypt,ciphertext,0.3595,0.8369,NAN,0.63,0.77,0.86,0.19,0.58,0.2,0.45
53 | rlocalProtocol,rurl,0.3235,0.5857,NAN,0.66,0.6,0.67,0.47,0.91,0.21,0.25
54 | hSpace,popupFeatures,0.0177,0.1427,0,0.25,0.48,0.6,0.49,0.86,0.23,0.35
55 | linkTab,alignMiddle,0.0625,0.0869,NAN,0.4,0.46,0.62,0.12,NAN,0.27,0.45
56 | lockRatio,alignRight,0,0.1391,NAN,0.47,0.45,0.57,0.13,0.74,0.2,0.55
57 | substr,substring,1,0.9303,0.8889,0.91,0.86,0.9,0.64,0.97,0.67,0.67
58 | change,submit,0.4,0.426,NAN,0.59,0.67,0.52,0.33,0.82,0.0,0.5
59 | columns,menus,0.4,0.5565,0.0404,0.35,0.5,0.4,0.18,0.52,0.29,0.5
60 | deltaX,deltaY,0.2,0.8956,NAN,0.98,0.96,0.99,0.64,0.95,0.83,0.92
61 | MINUTE,SECOND,0.125,0.9564,NAN,0.77,0.85,0.95,0.29,0.99,0.0,0.5
62 | onDragStart,onDragEnd,0.1457,0.9783,NAN,0.95,0.93,0.88,0.27,0.96,0.55,0.68
63 | expect,bp,0.0578,0.0368,0.1062,0.05,0.48,0.22,0.05,0.07,0.17,0.25
64 | items,files,0.575,0.5826,0.7505,0.3,0.59,0.38,0.28,0.77,0.4,0.6
65 | disabled,visible,0.175,0.713,NAN,0.36,0.51,0.52,0.42,0.74,0.62,0.75
66 | round,sqrt,0.0227,0.5732,NAN,0.6,0.78,0.77,0.85,0.76,0.0,0.4
67 | teal,lightgrey,0.25,0.8696,NAN,0.25,0.7,0.87,0.39,NAN,0.11,0.28
68 | navy,lightblue,0.3638,0.9525,0.231,0.38,0.76,0.87,0.35,0.9,0.0,0.22
69 | navy,magenta,0.25,0.905,NAN,0.38,0.74,0.74,0.38,0.87,0.14,0.36
70 | equal,cut,0.0455,0.1224,NAN,0.09,0.51,0.09,0.15,0.34,0.2,0.4
71 | start,begin,0.925,0.9739,NAN,0.58,0.67,0.48,0.69,0.85,0.0,0.5
72 | visible,showing,0.85,0.9303,NAN,0.55,0.7,0.54,-0.15,0.66,0.0,0.5
73 | arrayClass,boolClass,0.1922,0.6387,NAN,0.65,0.76,0.84,0.31,0.97,0.5,0.7
74 | me,br,0.159,0.0986,NAN,0.06,0.62,0.16,0.35,0.46,0.0,0.5
75 | dispatchIDs,_dispatchIDs,0.7955,0.9525,NAN,0.78,0.88,0.78,0.5,0.43,0.92,0.92
76 | getAnimation,depot,0.0227,0.0277,NAN,0.19,0.57,0.78,0.44,0.59,0.17,0.29
77 | cx,sx,0.4107,0.4409,NAN,0.45,0.77,0.61,0.52,0.68,0.5,0.75
78 | records,entries,0.9062,0.8857,1,0.41,0.53,0.52,0.14,0.76,0.29,0.57
79 | adjusted_scale,rangy,0.3333,0.4782,0.242,0.27,0.39,0.27,0.35,NAN,0.0,0.18
80 | vSpace,advisoryTitleInputLabel,0.0578,0.0569,0.0478,0.38,0.47,0.53,0.38,NAN,0.17,0.22
81 | start,searches,0.1043,0.1521,NAN,0.21,0.47,0.27,-0.16,0.45,0.38,0.5
82 | len,ln,0.85,0.8956,0.8428,0.66,0.75,0.72,0.6,0.85,0.67,0.67
83 | self,that,0.175,0.3999,0.4594,0.28,0.7,0.62,0.1,0.88,0.0,0.5
84 | y,z,0.475,0.6608,NAN,0.58,0.86,0.73,0.58,0.84,0.0,0.5
85 | element,elm,0.8862,0.9525,NAN,0.58,0.73,0.6,0.37,0.84,0.43,0.43
86 | id,userid,0.8182,0.8578,0.8708,0.56,0.67,0.45,-0.02,0.77,0.33,0.33
87 | miny,ypos,0.2955,0.6916,0.0247,0.27,0.7,0.6,0.4,0.75,0.0,0.5
88 | newLength,Group,0.0295,0.0177,NAN,0.15,0.41,0.1,0.13,0.32,0.0,0.28
89 | colspan,assignTo,0.0588,0.0793,NAN,0.1,0.39,0.21,0.22,0.08,0.12,0.44
90 | setScrollTop,prefixWith,0.0973,0.1594,NAN,-0.15,0.22,0.03,0.32,0.84,0.0,0.42
91 | getMinutes,getUTCMinutes,0.797,0.9348,0.9794,0.93,0.87,0.87,0.28,0.95,0.77,0.77
92 | FONTDATA,FONTS,0.5833,0.8912,NAN,0.92,0.92,0.95,0.54,0.75,0.5,0.56
93 | ReactEmptyComponent,renderToStaticMarkup,0.0735,0.0485,NAN,0.6,0.65,0.65,0.22,0.55,0.1,0.52
94 | VERSION,geoJson,0.0938,0.1195,0.1606,0.16,0.32,0.09,0.25,0.48,0.0,0.5
95 | s1,s2,0.4615,0.7592,NAN,0.97,0.96,0.99,0.56,0.89,0.5,0.75
96 | precondition,prereq,0.8393,0.9254,0.8173,0.57,0.75,0.88,0.71,0.86,0.25,0.38
97 | setAttributeNode,createAttribute,0.3928,0.7764,NAN,0.69,0.74,0.94,0.43,0.75,0.5,0.62
98 | Connection,Client,0.2082,0.413,NAN,0.74,0.74,0.87,0.36,0.81,0.3,0.45
99 | dom__FixedSizeListIterator,_dom_pos,0.225,0.5043,NAN,0.46,0.64,0.86,0.91,NAN,0.15,0.21
100 | v2,v3,0.2917,0.8043,NAN,0.85,0.9,0.92,0.5,0.91,0.5,0.75
101 | onAdd,onRemove,0.025,0.9739,NAN,0.81,0.86,0.91,0.46,0.94,0.25,0.44
102 | destroy,clear,0.675,0.8435,0.8296,0.55,0.69,0.63,0.55,0.93,0.14,0.43
103 | panelTitle,panelTitle1,0.923,0.9598,0.625,0.92,0.96,0.86,0.69,NAN,0.91,0.91
104 | __dependency1__,__dependency2__,0.4375,0.9186,NAN,1.0,0.98,0.96,0.65,0.82,0.93,0.97
105 | minVal,minValue,0.9583,0.9564,0.6913,0.84,0.89,0.65,-0.2,0.77,0.75,0.75
106 | ace,__ace_shadowed__,0.5,0.8174,0.3489,0.44,0.66,0.84,0.69,NAN,0.19,0.19
107 | m1,m2,0.6607,0.8696,NAN,0.94,0.93,0.89,0.69,0.84,0.5,0.75
108 | latitude,longitude,0.0207,0.9564,NAN,0.91,0.94,0.97,0.67,0.96,0.67,0.78
109 | cols,rects,0.175,0.4521,NAN,0.41,0.66,0.4,-0.23,0.76,0.2,0.5
110 | resetSize,popupFeatures,0.0358,0.1615,NAN,0.13,0.49,0.58,0.29,0.75,0.15,0.42
111 | trim,separatedList,0.2045,0.336,NAN,0.3,0.36,0.22,0.02,NAN,0.15,0.23
112 | parentWindow,scanner,0.0682,0.0039,NAN,0.14,0.34,0.06,0.2,0.26,0.17,0.38
113 | targetPopup,popupResizable,0.2955,0.6204,NAN,0.51,0.63,0.85,0.45,NAN,0.07,0.43
114 | popupToolbar,popupDependent,0.173,0.6186,0.3865,0.63,0.75,0.9,0.34,NAN,0.36,0.61
115 | split,shim,0.2728,0.3595,0.242,0.21,0.45,0.16,0.04,0.35,0.4,0.6
116 | generalTab,advancedTab,0.3333,0.8696,0,0.63,0.76,0.91,0.77,0.98,0.27,0.59
117 | ELEMENT_NODE,TEXT_NODE,0.4808,0.8195,0.8147,0.82,0.79,0.87,0.34,0.94,0.58,0.67
118 | rlocalProtocol,rprotocol,0.6785,0.8881,0.9428,0.92,0.78,0.77,0.47,0.81,0.57,0.61
119 | listeners,cbs,0.125,0.1391,0.143,0.55,0.62,0.55,0.28,0.76,0.11,0.22
120 | user,person,0.9375,0.8912,0.8462,0.55,0.73,0.72,0.19,0.81,0.17,0.42
121 | onPlay,onPause,0.1608,0.8881,NAN,0.83,0.83,0.84,0.17,0.94,0.43,0.64
122 | re,destruct,0.0207,0.0652,0.0247,0.12,0.45,0.12,-0.04,0.46,0.12,0.19
123 | translateX,translateY,0.225,0.9478,NAN,0.98,0.97,0.97,0.64,0.94,0.9,0.95
124 | click,dblclick,0.2333,0.9653,NAN,0.75,0.75,0.66,0.39,0.82,0.62,0.62
125 | columns,requests,0.1362,0.1224,NAN,0.24,0.49,0.31,0.17,0.75,0.25,0.56
126 | oMatchesSelector,msMatchesSelector,0.6667,0.8912,0.4824,0.92,0.96,0.94,0.63,1.0,0.88,0.91
127 | a11,b12,0.2308,0.6788,0.1062,0.76,0.87,0.86,0.55,0.93,0.33,0.67
128 | pink,brown,0.0385,0.7391,NAN,0.38,0.74,0.82,0.63,0.91,0.0,0.4
129 | floor,abs,0.1138,0.4545,NAN,0.65,0.82,0.82,0.76,0.9,0.0,0.3
130 | math,miny,0.0227,0.3123,NAN,0.47,0.63,0.44,0.27,0.35,0.25,0.62
131 | toggle,isLength,0.0177,0.0123,0.1147,0.19,0.42,0.07,-0.04,0.32,0.12,0.44
132 | found,rawFunc,0.0385,0.0968,NAN,0.06,0.42,0.15,-0.03,NAN,0.29,0.5
133 | material,light,0.0832,0.3392,NAN,0.51,0.65,0.73,0.4,0.71,0.0,0.31
134 | controller,mutator,0.325,0.3739,0.242,0.23,0.31,0.22,0.11,0.55,0.2,0.45
135 | files,locations,0.5,0.5494,NAN,0.25,0.46,0.3,-0.04,0.82,0.22,0.39
136 | files,problems,0.0535,0.0684,NAN,0.29,0.56,0.49,0.22,0.78,0.38,0.5
137 | frames,markers,0.2115,0.5787,NAN,0.11,0.35,0.31,0.06,0.69,0.29,0.57
138 | objects,shortcuts,0.1138,0.1461,NAN,0.2,0.38,0.42,0.12,0.69,0.33,0.56
139 | a01,b11,0.173,0.4581,NAN,0.8,0.9,0.88,0.35,0.94,0.33,0.67
140 | defHeaders,defHeaderName,0.5,0.9763,0.8124,0.82,0.9,0.92,0.68,0.5,0.69,0.73
141 | right,bottom,0.1043,0.7827,NAN,0.69,0.84,0.89,0.68,0.93,0.0,0.42
142 | count,total,0.8462,0.7994,0.7913,0.57,0.69,0.65,0.03,0.83,0.2,0.6
143 | click,mouseup,0.5168,0.9131,0.143,0.51,0.67,0.73,0.34,0.9,0.0,0.36
144 | inputs,resources,0.3845,0.6186,NAN,0.29,0.45,0.32,-0.03,0.78,0.22,0.44
145 | advice,alternative,0.075,0.113,NAN,0.26,0.35,0.46,0.27,0.71,0.27,0.41
146 | abs,sqrt,0.0227,0.7154,NAN,0.61,0.81,0.83,0.85,0.89,0.0,0.38
147 | cells,segments,0.6155,0.7391,NAN,0.3,0.5,0.37,0.14,0.72,0.25,0.44
148 | place,written,0.025,0.113,NAN,0.15,0.41,0.2,0.11,0.42,0.14,0.43
149 | names,sources,0.575,0.5826,0.3489,0.37,0.54,0.52,0.11,0.81,0.29,0.5
150 | minx,ymax,0.1043,0.6739,0.0404,0.58,0.79,0.71,0.27,0.83,0.25,0.62
151 | segments,annotations,0.25,0.3178,NAN,0.27,0.39,0.38,0.21,0.74,0.18,0.45
152 | ELEMENT_ARRAY_BUFFER,bufferData,0.475,0.8696,NAN,0.51,0.75,0.9,0.52,0.55,0.0,0.25
153 | body,currants,0.15,0.2173,0.276,0.26,0.47,0.04,0.29,0.5,0.0,0.25
154 | cfg,conf,1,0.905,0.9428,0.43,0.66,0.37,0.25,0.73,0.25,0.5
155 | userA,userB,0.5192,0.9397,0.7958,0.95,0.94,0.95,0.4,0.97,0.8,0.9
156 | keyup,deferId,0.075,0.0869,NAN,0.19,0.46,0.32,-0.11,0.43,0.14,0.43
157 | columns,datasets,0.2045,0.4545,NAN,0.48,0.53,0.62,0.09,0.71,0.12,0.5
158 | getBorderWidth,getPadding,0.4317,0.8103,0.5999,0.79,0.76,0.9,0.6,0.97,0.36,0.54
159 | obj1,obj2,0.6362,0.7866,0.5967,0.97,0.96,0.95,0.57,0.85,0.75,0.88
160 | limit,exponent,0.1818,0.336,NAN,0.19,0.5,0.41,0.01,0.77,0.12,0.38
161 | camera,texture,0.0625,0.1738,NAN,0.23,0.56,0.6,0.35,0.62,0.14,0.5
162 | selected,active,0.8125,0.9131,0.892,0.37,0.54,0.51,0.33,0.77,0.12,0.44
163 | rows,columns,0.025,0.9217,NAN,0.72,0.78,0.81,0.34,0.86,0.29,0.43
164 | body,agg,0.05,0.0869,NAN,0.17,0.55,0.15,-0.01,0.36,0.0,0.38
165 | ui,secret,0.0625,0.0652,NAN,0.05,0.5,0.09,0.33,0.54,0.0,0.17
166 | dataAndEvents,deepDataAndEvents,0.7143,0.851,0.8374,0.91,0.97,0.97,0.86,0.7,0.76,0.76
167 | oDomRef,structuralType,0.0832,0,NAN,0.14,0.34,0.27,0.2,0.59,0.0,0.25
168 | i,targ,NAN,NAN,0.0346,0.22,0.68,0.19,0.28,0.45,0.0,0.12
169 | allocate,contextmenu,NAN,NAN,0.0831,0.02,0.27,0.15,0.18,0.47,0.18,0.45
170 | objects,records,NAN,NAN,0.8931,0.5,0.58,0.59,0.36,0.77,0.14,0.57
171 | λ0,φ0,NAN,NAN,0.242,0.92,0.91,0.89,0.41,NAN,0.5,0.75
172 | idx,indx,NAN,NAN,0.9617,0.67,0.7,0.38,0.06,0.77,0.75,0.75
173 | idx,ridx,NAN,NAN,0.3234,0.55,0.77,0.57,0.05,0.74,0.75,0.75
174 | ranges,codes,NAN,NAN,0.0404,0.49,0.55,0.27,0.18,0.67,0.33,0.58
175 | topLevelTarget,topLevelTargetID,NAN,NAN,0.7658,0.91,0.94,0.87,0.7,0.63,0.88,0.88
176 | store,storage,NAN,NAN,0.7382,0.72,0.71,0.59,0.09,0.77,0.71,0.71
177 | indices,positions,NAN,NAN,0.983,0.29,0.59,0.47,0.37,0.71,0.22,0.5
178 | files,filenames,NAN,NAN,0.9186,0.8,0.76,0.76,0.34,0.78,0.56,0.56
179 | λ0,λ1,NAN,NAN,0.1838,0.92,0.95,0.95,0.83,NAN,0.5,0.75
180 | cosφ0,cosφ,NAN,NAN,0.7597,0.94,0.97,0.9,0.74,NAN,0.8,0.8
181 | maxLine,maxLineLength,NAN,NAN,0.7225,0.82,0.88,0.88,0.59,0.83,0.54,0.54
182 | alignCenter,alignMiddle,NAN,NAN,0.5563,0.87,0.9,0.94,0.58,0.94,0.45,0.73
183 | targetFrame,targetFrameName,NAN,NAN,0.935,0.91,0.93,0.83,0.75,0.81,0.73,0.73
184 | angle,radians,NAN,NAN,0.9052,0.63,0.78,0.88,0.44,0.83,0.14,0.43
185 | events,rchecked,NAN,NAN,0.1819,0.19,0.31,0.18,-0.02,0.26,0.12,0.44
186 | img,thumb,NAN,NAN,0.7658,0.45,0.6,0.54,0.13,0.7,0.2,0.4
187 | PLACEHOLDER,vertexFormat,NAN,NAN,0.299,0.23,0.27,0.09,-0.22,0.44,0.0,0.46
188 | angle,theta,NAN,NAN,0.9186,0.52,0.74,0.83,0.31,0.85,0.0,0.5
189 | foo,bar,NAN,NAN,0.8168,0.71,0.83,0.81,0.5,0.8,0.0,0.5
190 | $behaviour,foldingRules,NAN,NAN,0.0831,0.51,0.64,0.89,0.64,NAN,0.08,0.46
191 | foo,abc,NAN,NAN,0.8196,0.44,0.78,0.58,0.4,0.76,0.0,0.5
192 | cols,domains,NAN,NAN,0.1819,0.19,0.47,0.13,0.04,0.72,0.29,0.43
193 | CallExpression,BlockStatement,NAN,NAN,0.242,0.62,0.67,0.79,0.35,0.65,0.07,0.5
194 | raw,movie,NAN,NAN,0.276,0.05,0.39,0.18,0.29,0.31,0.0,0.3
195 | err,er,NAN,NAN,0.8889,0.74,0.8,0.76,0.71,0.9,0.67,0.67
196 | res,resp,NAN,NAN,0.9224,0.68,0.81,0.63,0.52,0.9,0.75,0.75
197 | utils,util,NAN,NAN,0.9526,0.71,0.73,0.56,0.37,0.9,0.8,0.8
198 | remove,focus,NAN,NAN,0.1062,0.27,0.6,0.54,0.28,0.75,0.0,0.42
199 | items,records,NAN,NAN,0.7983,0.47,0.58,0.71,0.3,0.85,0.14,0.43
200 | bindBuffer,ELEMENT_ARRAY_BUFFER,NAN,NAN,0.276,0.49,0.79,0.9,0.65,0.63,0.05,0.28
201 | gray,silver,NAN,NAN,0.1147,0.4,0.77,0.83,0.22,0.82,0.0,0.33
202 | get,facets,NAN,NAN,0.208,0.09,0.48,0.31,-0.0,0.3,0.33,0.42
203 | files,images,NAN,NAN,0.807,0.47,0.6,0.49,-0.02,0.78,0.33,0.58
204 | nodeLinkFn,childLinkFn,NAN,NAN,0.8763,0.84,0.89,0.94,0.47,0.75,0.55,0.73
205 | b01,a03,NAN,NAN,0.0824,0.82,0.92,0.88,0.2,0.96,0.33,0.67
206 | styleSelectLabel,tag_h4,NAN,NAN,0.1377,0.15,0.38,0.64,0.39,NAN,0.06,0.22
207 | a24,a30,NAN,NAN,0.734,0.87,0.95,0.81,0.25,0.97,0.33,0.67
208 | lightgreen,lightgrey,NAN,NAN,0.2172,0.94,0.97,0.95,0.76,NAN,0.8,0.85
209 | id,sessionid,NAN,NAN,0.9169,0.41,0.55,0.45,0.12,0.71,0.22,0.22
210 | expect,assume,NAN,NAN,0.7983,0.46,0.65,0.48,0.25,0.58,0.0,0.5
211 | container,submenu,NAN,NAN,0.242,0.28,0.49,0.35,0.33,0.78,0.11,0.44
212 | images,streams,NAN,NAN,0.0404,0.27,0.46,0.35,-0.09,0.67,0.14,0.5
213 | btnModify,btnSetValue,NAN,NAN,0.8292,0.6,0.75,0.83,0.55,NAN,0.27,0.55
214 | setMinutes,setSeconds,NAN,NAN,0.0593,0.77,0.86,0.97,0.72,0.99,0.4,0.7
215 | styleSelectLabel,cssClassInputLabel,NAN,NAN,0.8547,0.64,0.74,0.94,0.74,NAN,0.44,0.67
216 | item,record,NAN,NAN,0.9662,0.29,0.57,0.66,0.18,0.77,0.0,0.33
217 | destroy,dispose,NAN,NAN,0.3022,0.48,0.63,0.5,0.5,0.85,0.29,0.64
218 | on,immediate,NAN,NAN,0.208,0.1,0.51,0.25,-0.03,0.42,0.0,0.11
219 | equals,same,NAN,NAN,0.7225,0.58,0.68,0.67,0.43,0.65,0.17,0.42
220 | wrapper,scroller,NAN,NAN,0.0404,0.39,0.52,0.46,0.62,0.69,0.38,0.62
221 | builtinConstants,buildinConstants,NAN,NAN,0.9023,0.91,0.92,0.93,0.34,0.97,0.94,0.97
222 |
--------------------------------------------------------------------------------
/varclr/data/__init__.py:
--------------------------------------------------------------------------------
1 | from varclr.data.dataset import RenamesDataModule
2 | from varclr.data.preprocessor import Preprocessor
3 |
4 | __all__ = [
5 | "RenamesDataModule",
6 | "Preprocessor",
7 | ]
8 |
--------------------------------------------------------------------------------
/varclr/data/dataset.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import random
4 | from typing import List, Optional, Text, Tuple
5 |
6 | import numpy as np
7 | import pytorch_lightning as pl
8 | import torch
9 | from torch.nn.utils.rnn import pad_sequence
10 | from torch.utils.data import DataLoader, Dataset, random_split
11 |
12 | from varclr.data.preprocessor import Preprocessor
13 | from varclr.data.vocab import Vocab
14 | from varclr.models.tokenizers import PretrainedTokenizer
15 |
16 |
class Example(object):
    """A single identifier/sentence plus its vocabulary-index embedding.

    The sentence is stored stripped and lower-cased. ``populate_embeddings``
    must be called (with a vocab) before the example is handed to a collate
    function.
    """

    def __init__(self, sentence):
        self.sentence = sentence.strip().lower()
        # Filled by populate_embeddings() with vocabulary indices.
        self.embeddings = []

    def populate_embeddings(
        self, words, zero_unk, tokenization, ngrams, scramble_rate=0
    ):
        """Map ``self.sentence`` to vocab indices in ``self.embeddings``.

        Args:
            words: token -> index mapping (see ``Vocab``).
            zero_unk: if true, out-of-vocab tokens are dropped instead of
                being mapped to the UNK index.
            tokenization: "ngrams" for character n-grams, "sp" for
                whitespace-separated tokens.
            ngrams: n-gram width used when ``tokenization == "ngrams"``.
            scramble_rate: probability of shuffling token order
                (training-time augmentation; 0 disables it).

        Raises:
            NotImplementedError: for any other tokenization scheme.
        """
        self.embeddings = []
        if tokenization == "ngrams":
            # Pad with spaces so word boundaries produce their own n-grams.
            sentence = " " + self.sentence.strip() + " "
            # Slide a width-`ngrams` window over the padded sentence; the
            # range bound drops the short tail windows the original loop
            # skipped explicitly.
            for j in range(len(sentence) - ngrams + 1):
                wd = Vocab.lookup(words, sentence[j : j + ngrams], zero_unk)
                if wd is not None:
                    self.embeddings.append(wd)
        elif tokenization == "sp":
            arr = self.sentence.split()
            if scramble_rate:
                if random.random() <= scramble_rate:
                    random.shuffle(arr)
            for token in arr:
                wd = Vocab.lookup(words, token, zero_unk)
                if wd is not None:
                    self.embeddings.append(wd)
        else:
            raise NotImplementedError
        # Guarantee at least one index so downstream padding never sees an
        # empty sequence.
        if len(self.embeddings) == 0:
            self.embeddings = [words[Vocab.unk_string]]
52 |
53 |
class RenamesDataset(Dataset):
    """Dataset of identifier/sentence pairs read from a rename or benchmark file.

    Each item is a tuple ``(Example, Example)`` when training, or
    ``(Example, Example, score)`` when evaluating — the third column of a CSV
    row is parsed as a float score, and rows whose score is NaN are dropped.
    """

    def __init__(self, data_file: str, args, training=True) -> None:
        super().__init__()
        self.data_file = data_file
        self.tokenization = args.tokenization
        self.ngrams = args.ngrams
        self.scramble_rate = args.scramble_rate
        self.zero_unk = args.zero_unk
        self.training = training
        # One preprocessor per side of the pair; Preprocessor.build selects
        # the variant from substrings of the data file path.
        self.processor1, self.processor2 = Preprocessor.build(data_file, args)
        self.examples_pairs = self.read_examples()
        # Build and cache the vocab on first use; later runs load the
        # cached file instead of rebuilding.
        if not os.path.exists(args.vocab_path):
            print(f"Vocab not found. Creating from {data_file}")
            self.vocab = Vocab.build(self.examples_pairs, args)
            torch.save(self.vocab, args.vocab_path)
        else:
            self.vocab = torch.load(args.vocab_path)

    def __getitem__(self, i):
        """Return pair ``i`` with embeddings (re)populated in place.

        Scrambling augmentation is applied only when ``self.training`` is
        true; evaluation always uses scramble_rate 0.
        """
        self.examples_pairs[i][0].populate_embeddings(
            self.vocab,
            self.zero_unk,
            self.tokenization,
            self.ngrams,
            scramble_rate=self.scramble_rate if self.training else 0,
        )
        self.examples_pairs[i][1].populate_embeddings(
            self.vocab,
            self.zero_unk,
            self.tokenization,
            self.ngrams,
            scramble_rate=self.scramble_rate if self.training else 0,
        )
        return self.examples_pairs[i]

    def __len__(self):
        return len(self.examples_pairs)

    def read_examples(self):
        """Parse ``self.data_file`` into example tuples, skipping duplicates.

        Files whose path contains "csv" are treated as IdBench-style CSVs
        (header row skipped, comma separated, third column a score); all
        other files are tab-separated pairs.
        """
        examples = []
        finished = set([])  # check for duplicates
        # NOTE(review): format detection by substring ("csv" in path) is
        # fragile if a directory name ever contains "csv" — confirm against
        # callers.
        spliter = "," if "csv" in self.data_file else "\t"
        with io.open(self.data_file, "r", encoding="utf-8") as f:
            for idx, i in enumerate(f):
                if "csv" in self.data_file and idx == 0:
                    # skip the first line in IdBench csv
                    continue
                # Deduplicate on the raw line text.
                if i in finished:
                    continue
                else:
                    finished.add(i)

                i = i.split(spliter)
                # Skip rows where either identifier is empty.
                if len(i[0].strip()) == 0 or len(i[1].strip()) == 0:
                    continue

                i[0] = self.processor1(i[0])
                i[1] = self.processor2(i[1])

                if self.training:
                    e = (Example(i[0]), Example(i[1]))
                else:
                    # Evaluation rows carry a score; drop rows with NaN
                    # (e.g. the NAN entries in the IdBench CSVs).
                    if np.isnan(float(i[2])):
                        continue
                    e = (Example(i[0]), Example(i[1]), float(i[2]))
                examples.append(e)
        return examples

    @staticmethod
    def collate_fn(example_pairs):
        """Collate pairs into padded index tensors plus lengths.

        Returns ``((idxs1, lens1), (idxs2, lens2))`` for training pairs, or
        additionally a score tensor for 3-tuples (evaluation).
        """
        def torchify(batch: List[Example]):
            idxs = pad_sequence(
                [torch.tensor(ex.embeddings, dtype=torch.long) for ex in batch],
                batch_first=True,
            )
            lengths = torch.tensor([len(e.embeddings) for e in batch], dtype=torch.long)
            return idxs, lengths

        ret = torchify([pair[0] for pair in example_pairs]), torchify(
            [pair[1] for pair in example_pairs]
        )
        if len(example_pairs[0]) == 3:
            return ret[0], ret[1], torch.tensor([e[2] for e in example_pairs])
        else:
            return ret

    @staticmethod
    def collate_fn_transformers(example_pairs):
        """Collate pairs with the pretrained transformer tokenizer.

        Same return structure as ``collate_fn`` but each side is
        ``(input_ids, attention_mask)`` from the HuggingFace tokenizer.
        """
        def torchify(batch: List[Example]):
            tokenizer = PretrainedTokenizer.get_instance()
            ret = tokenizer(
                [ex.sentence for ex in batch], return_tensors="pt", padding=True
            )
            return ret["input_ids"], ret["attention_mask"]

        ret = torchify([pair[0] for pair in example_pairs]), torchify(
            [pair[1] for pair in example_pairs]
        )
        if len(example_pairs[0]) == 3:
            return ret[0], ret[1], torch.tensor([e[2] for e in example_pairs])
        else:
            return ret
156 |
157 |
class RenamesDataModule(pl.LightningDataModule):
    """LightningDataModule wiring ``RenamesDataset`` into train/val/test loaders.

    ``test_data_files`` is a single comma-separated string of paths. When no
    validation file is given, 1000 examples are split off the training set.
    """

    def __init__(
        self, train_data_file: str, valid_data_file: str, test_data_files: str, args
    ):
        super().__init__()
        self.train_data_file = train_data_file
        self.valid_data_file = valid_data_file
        # Comma-separated list of test-set paths.
        self.test_data_files = test_data_files.split(",")
        self.train_percent = args.train_percent
        self.args = args

    def _collate_fn(self):
        """Return the collate function matching the encoder choice.

        The "bert" model needs the transformer tokenizer; every other model
        uses vocab-index padding. (Previously this expression was duplicated
        in all three dataloader methods.)
        """
        if self.args.model != "bert":
            return RenamesDataset.collate_fn
        return RenamesDataset.collate_fn_transformers

    def prepare_data(self):
        """Fail fast if the train or any test data file is missing."""
        assert os.path.exists(self.train_data_file)
        assert all(os.path.exists(test) for test in self.test_data_files)

    def setup(self, stage=None):
        """Instantiate datasets for the requested stage (all when None)."""

        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            self.train = RenamesDataset(self.train_data_file, self.args, training=True)
            if self.valid_data_file is None:
                # Hold out a fixed 1000 examples for validation.
                self.train, self.valid = random_split(
                    self.train, [len(self.train) - 1000, 1000]
                )
                # NOTE(review): self.valid is a torch Subset here, so setting
                # .training on it does NOT reach the underlying
                # RenamesDataset (which is shared with self.train and stays
                # training=True) — these validation examples may still be
                # scrambled. Confirm intended behavior before relying on it.
                self.valid.training = False
                self.valid.data_file = self.train_data_file
            else:
                self.valid = RenamesDataset(
                    self.valid_data_file, self.args, training=False
                )
            # Optionally keep only a fraction of the training examples.
            num_for_training = int(len(self.train) * self.train_percent)
            self.train = random_split(
                self.train, [num_for_training, len(self.train) - num_for_training]
            )[0]

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.tests = [
                RenamesDataset(test_data_file, self.args, training=False)
                for test_data_file in self.test_data_files
            ]

    def train_dataloader(self):
        return DataLoader(
            self.train,
            batch_size=self.args.batch_size,
            num_workers=self.args.num_workers,
            collate_fn=self._collate_fn(),
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid,
            batch_size=self.args.batch_size,
            num_workers=self.args.num_workers,
            collate_fn=self._collate_fn(),
        )

    def test_dataloader(self):
        return [
            DataLoader(
                test,
                batch_size=self.args.batch_size,
                num_workers=self.args.num_workers,
                collate_fn=self._collate_fn(),
            )
            for test in self.tests
        ]
232 |
--------------------------------------------------------------------------------
/varclr/data/preprocessor.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Tuple, Union
3 |
4 | from sacremoses import MosesTokenizer
5 |
6 | from varclr.models.tokenizers import Tokenizer
7 |
8 |
class Preprocessor:
    """Base preprocessor: the identity transform on input sentences.

    ``build`` inspects the data-file path and returns a matching pair of
    preprocessor instances, one per side of an example pair.
    """

    @staticmethod
    def build(data_file, args) -> Tuple["Preprocessor", "Preprocessor"]:
        """Select a preprocessor pair based on substrings of ``data_file``."""
        if "STS" in data_file:
            print(f"Using STS processor for {data_file}")
            make = lambda: STSTextPreprocessor.from_args(args)
        elif "idbench" in data_file:
            print(f"Using code processor for {data_file}")
            make = lambda: CodePreprocessor.from_args(args)
        elif "20k" in data_file:
            # Plain rename data needs no preprocessing at all.
            make = Preprocessor
        elif "nli" in data_file or "cs-cs" in data_file:
            print(f"Using NLI processor for {data_file}")
            make = lambda: NLITextPreprocessor.from_args(args)
        else:
            raise NotImplementedError
        # Two independent instances, mirroring the original per-branch pairs.
        return make(), make()

    def __call__(self, sentence):
        """Identity: subclasses override with real normalization."""
        return sentence
32 |
33 |
class NLITextPreprocessor(Preprocessor):
    """Lower-cases sentences, optionally applying "sp" subword tokenization."""

    def __init__(self, tokenization, sp_model) -> None:
        self.tokenization = tokenization
        # The tokenizer is only built when subword mode is requested.
        if tokenization == "sp":
            self.tokenizer = Tokenizer.build(sp_model)

    @staticmethod
    def from_args(args) -> "NLITextPreprocessor":
        """Construct from an argparse-style namespace."""
        return NLITextPreprocessor(args.tokenization, args.sp_model)

    def __call__(self, sentence):
        """Return the lower-cased (and optionally subword-split) sentence."""
        lowered = sentence.lower()
        if self.tokenization != "sp":
            return lowered
        return " ".join(self.tokenizer.encode(lowered))
49 |
50 |
class STSTextPreprocessor(Preprocessor):
    """Moses-tokenizes and lower-cases sentences for STS-style data.

    Optionally applies "sp" subword tokenization on top of the Moses output.
    """

    def __init__(self, lang, tokenization, sp_model) -> None:
        self.moses = MosesTokenizer(lang=lang)
        self.tokenization = tokenization
        if self.tokenization == "sp":
            self.tokenizer = Tokenizer.build(sp_model)

    @staticmethod
    def from_args(args) -> "STSTextPreprocessor":
        # Bug fix: this previously called
        # STSTextPreprocessor(args.tokenization, args.sp_model), shifting
        # tokenization into the ``lang`` slot, sp_model into ``tokenization``,
        # and omitting the required third argument — a TypeError whenever the
        # STS branch of Preprocessor.build ran. Default the language to "en"
        # (MosesTokenizer's documented default) unless args carries one.
        return STSTextPreprocessor(
            getattr(args, "lang", "en"), args.tokenization, args.sp_model
        )

    def __call__(self, sentence):
        """Return the Moses-tokenized, lower-cased (and optionally
        subword-split) form of ``sentence``."""
        sent = " ".join(self.moses.tokenize(sentence))
        sent = sent.lower()
        if self.tokenization == "sp":
            sent = " ".join(self.tokenizer.encode(sent))
        return sent
68 |
69 |
class CodePreprocessor(Preprocessor):
    """Normalizes source-code identifiers into space-separated word strings.

    camelCase is split on lower→upper boundaries, underscores become spaces,
    "@" characters are stripped, and everything is lower-cased; an optional
    "sp" subword tokenizer can further split the words.
    """

    def __init__(self, tokenization=None, sp_model=None):
        self.tokenization = tokenization
        if self.tokenization == "sp":
            self.tokenizer = Tokenizer.build(sp_model)

    @staticmethod
    def from_args(args) -> "CodePreprocessor":
        """Construct from an argparse-style namespace."""
        return CodePreprocessor(args.tokenization, args.sp_model)

    def __call__(self, var: Union[str, List[str]]):
        """Process one identifier, or a list of identifiers element-wise."""
        if isinstance(var, list) and all(isinstance(v, str) for v in var):
            return [self._process(v) for v in var]
        if isinstance(var, str):
            return self._process(var)
        raise NotImplementedError

    def _process(self, var):
        """Normalize a single identifier string."""
        # Strip "@" characters, insert "_" at each lower→upper case
        # boundary, then lower-case and turn underscores into spaces.
        cleaned = var.replace("@", "")
        snake = re.sub("([a-z]|^)([A-Z]{1})", r"\1_\2", cleaned)
        words = snake.lower().replace("_", " ").strip()
        if self.tokenization == "sp":
            words = " ".join(self.tokenizer.encode(words))
        return words
99 |
--------------------------------------------------------------------------------
/varclr/data/vocab.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 |
class Vocab:
    """Builds and queries token→index vocabularies for Example pairs."""

    # Token that all out-of-vocabulary words map to (unless zero_unk).
    unk_string = "UUUNKKK"

    @staticmethod
    def build(examples, args):
        """Build a vocabulary according to ``args.tokenization``.

        Args:
            examples: iterable of (Example, Example) pairs; only each side's
                ``.sentence`` attribute is read.
            args: namespace with ``tokenization`` ("ngrams" or "sp") and,
                for n-grams, the width ``ngrams``.

        Raises:
            NotImplementedError: for any other tokenization scheme.
        """
        if args.tokenization == "ngrams":
            return Vocab.get_ngrams(examples, n=args.ngrams)
        elif args.tokenization == "sp":
            return Vocab.get_words(examples)
        else:
            raise NotImplementedError

    @staticmethod
    def _index_most_common(counter, max_len):
        """Map the ``max_len`` most frequent tokens to 0..k-1, then UNK.

        Shared tail of get_ngrams/get_words (previously duplicated).
        ``sorted`` is stable, so equally-frequent tokens keep first-seen
        order; the UNK token always receives the last index.
        """
        most_common = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        vocab = {}
        for token, _ in most_common[:max_len]:
            vocab[token] = len(vocab)
        vocab[Vocab.unk_string] = len(vocab)
        return vocab

    @staticmethod
    def get_ngrams(examples, max_len=200000, n=3):
        """Build a character n-gram vocabulary from both sides of each pair."""

        def update_counter(counter, sentence):
            # Space padding so word boundaries yield their own n-grams,
            # mirroring Example.populate_embeddings.
            word = " " + sentence.strip() + " "
            for j in range(len(word) - n + 1):
                counter[word[j : j + n]] += 1

        counter = Counter()
        for pair in examples:
            update_counter(counter, pair[0].sentence)
            update_counter(counter, pair[1].sentence)

        return Vocab._index_most_common(counter, max_len)

    @staticmethod
    def get_words(examples, max_len=200000):
        """Build a whitespace-token vocabulary from both sides of each pair."""
        counter = Counter()
        for pair in examples:
            counter.update(pair[0].sentence.split())
            counter.update(pair[1].sentence.split())

        return Vocab._index_most_common(counter, max_len)

    @staticmethod
    def lookup(words, w, zero_unk):
        """Return the index of ``w`` (case-insensitive) in ``words``.

        Unknown tokens map to None when ``zero_unk`` is true, otherwise to
        the UNK index.
        """
        w = w.lower()
        if w in words:
            return words[w]
        if zero_unk:
            return None
        return words[Vocab.unk_string]
77 |
--------------------------------------------------------------------------------
/varclr/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/squaresLab/VarCLR/30d7bfdcd518e69d3e39978c8957fe7fb7cd88ab/varclr/models/__init__.py
--------------------------------------------------------------------------------
/varclr/models/encoders.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List, Union
3 |
4 | import gdown
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.utils.rnn import pack_padded_sequence as pack
9 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
10 | from torch.nn.utils.rnn import pad_sequence
11 | from transformers import AutoModel, AutoTokenizer
12 |
13 | from varclr.data.preprocessor import CodePreprocessor
14 | from varclr.data.vocab import Vocab
15 | from varclr.models import urls_pretrained_model
16 |
17 |
class Encoder(nn.Module):
    """Abstract base class for VarCLR variable-name encoders.

    Subclasses implement ``forward(idxs, lengths)`` over pre-tokenized id
    tensors; an ``encode`` method accepting raw strings is installed on each
    subclass via the decorators below.
    """

    @staticmethod
    def build(args) -> "Encoder":
        # Dispatch on the --model flag; unknown names raise KeyError.
        return {"avg": Averaging, "lstm": LSTM, "bert": BERT}[args.model].from_args(
            args
        )

    @staticmethod
    def from_pretrained(model_name: str, save_path: str = "saved/") -> "Encoder":
        """Download (if necessary) and load one of the released models into save_path."""
        return {
            "varclr-avg": Averaging,
            "varclr-lstm": LSTM,
            "varclr-codebert": BERT,
            "codebert": CodeBERT,
        }[model_name].load(save_path)

    @staticmethod
    def from_args(args) -> "Encoder":
        raise NotImplementedError

    @staticmethod
    def load(save_path: str) -> "Encoder":
        raise NotImplementedError

    def forward(self, idxs, lengths):
        raise NotImplementedError

    def encode(self, inputs: Union[str, List[str]]) -> torch.Tensor:
        raise NotImplementedError

    def score(
        self, inputx: Union[str, List[str]], inputy: Union[str, List[str]]
    ) -> List[float]:
        """Element-wise cosine similarity between inputx[i] and inputy[i]."""
        if type(inputx) != type(inputy):
            raise Exception("Input X and Y must be either string or list of strings.")
        if isinstance(inputx, list) and len(inputx) != len(inputy):
            raise Exception("Input X and Y must have the same length")
        embx = self.encode(inputx)
        emby = self.encode(inputy)
        return F.cosine_similarity(embx, emby).tolist()

    def cross_score(
        self, inputx: Union[str, List[str]], inputy: Union[str, List[str]]
    ) -> List[List[float]]:
        """All-pairs cosine similarity matrix of shape (len(inputx), len(inputy))."""
        if isinstance(inputx, str):
            inputx = [inputx]
        if isinstance(inputy, str):
            inputy = [inputy]
        assert all(isinstance(inp, str) for inp in inputx)
        assert all(isinstance(inp, str) for inp in inputy)
        # Row-normalize both sides so the matrix product below is cosine similarity.
        embx = self.encode(inputx)
        embx /= embx.norm(dim=1, keepdim=True)
        emby = self.encode(inputy)
        emby /= emby.norm(dim=1, keepdim=True)
        return (embx @ emby.t()).tolist()

    @staticmethod
    def decor_forward(model_forward):
        """Decorate an encoder's forward pass to deal with raw inputs.

        The SentencePiece-style preprocessor is built once at decoration time
        and shared by every call; the returned function maps raw identifiers
        to vocab ids via ``self.vocab``, pads them, and calls the wrapped
        ``forward``.
        """
        processor = CodePreprocessor(
            tokenization="sp", sp_model=urls_pretrained_model.PRETRAINED_TOKENIZER
        )

        def torchify(batch):
            # Pad variable-length id lists into one (batch, max_len) LongTensor.
            idxs = pad_sequence(
                [torch.tensor(ex, dtype=torch.long) for ex in batch],
                batch_first=True,
            )
            lengths = torch.tensor([len(e) for e in batch], dtype=torch.long)
            return idxs, lengths

        def tokenize_and_forward(self, inputs: Union[str, List[str]]) -> torch.Tensor:
            if isinstance(inputs, str):
                inputs = [inputs]
            var_ids = processor(inputs)
            batch = torchify(
                [
                    [
                        Vocab.lookup(self.vocab, w, True)
                        for w in var.split()
                        if Vocab.lookup(self.vocab, w, True) is not None
                    ]
                    # If every token is OOV, fall back to a single UNK id.
                    or [self.vocab[Vocab.unk_string]]
                    for var in var_ids
                ]
            )
            idxs, lengths = batch
            # forward returns (pooled, extras); expose only the pooled embedding.
            return model_forward(self, idxs, lengths)[0].detach()

        return tokenize_and_forward

    @staticmethod
    def decor_bert_forward(model_forward):
        """Decorate an encoder's forward pass to deal with raw inputs.

        BERT variant: canonicalize with CodePreprocessor, then tokenize with
        the pretrained HuggingFace tokenizer (both built at decoration time).
        """
        processor = CodePreprocessor()
        tokenizer = AutoTokenizer.from_pretrained(
            urls_pretrained_model.PRETRAINED_TOKENIZER
        )

        def tokenize_and_forward(self, inputs: Union[str, List[str]]) -> torch.Tensor:
            inputs = processor(inputs)
            return_dict = tokenizer(inputs, return_tensors="pt", padding=True)
            return model_forward(
                self, return_dict["input_ids"], return_dict["attention_mask"]
            )[0].detach()

        return tokenize_and_forward
125 |
126 |
class Averaging(Encoder):
    """Bag-of-words encoder: masked mean of (dropout-regularized) token embeddings."""

    def __init__(self, vocab_size, dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.dropout = dropout

    @staticmethod
    def from_args(args):
        return Averaging(args.vocab_size, args.dim, args.dropout)

    @staticmethod
    def load(save_path: str) -> Encoder:
        """Download (if needed) and restore the pretrained averaging encoder."""
        gdown.cached_download(
            urls_pretrained_model.PRETRAINED_AVG_URL,
            # Fix: was "lstm.zip", which collided with LSTM.load's archive in
            # the same save_path and defeated gdown's md5-based caching.
            os.path.join(save_path, "avg.zip"),
            md5=urls_pretrained_model.PRETRAINED_AVG_MD5,
            postprocess=gdown.extractall,
        )
        state_dict = torch.load(
            os.path.join(
                save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "model"
            ),
            map_location=torch.device("cpu"),
        )
        # Recover hyperparameters from the checkpoint's embedding shape.
        vocab_size, dim = state_dict["encoder.embedding.weight"].shape
        # Checkpoint keys are prefixed with `encoder.`; load through a
        # throwaway wrapper module, then unwrap.
        wrapper = nn.Module()
        wrapper.encoder = Averaging(vocab_size, dim, 0)
        wrapper.load_state_dict(state_dict)
        encoder = wrapper.encoder
        # HACK: attach the vocab so Encoder.decor_forward can map raw strings.
        encoder.vocab = torch.load(
            os.path.join(
                save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "vocab"
            )
        )
        return encoder

    def forward(self, idxs, lengths):
        """Return (pooled, (word_embs, mask)); pooled is the length-normalized
        sum of embeddings over the valid (non-padding) positions."""
        word_embs = self.embedding(idxs)
        word_embs = F.dropout(word_embs, p=self.dropout, training=self.training)

        bs, max_len, _ = word_embs.shape
        # mask[i, t] = 1 while position t is within sequence i's true length.
        mask = (
            torch.arange(max_len).to(word_embs.device).expand(bs, max_len)
            < lengths.unsqueeze(1)
        ).float()
        pooled = (word_embs * mask.unsqueeze(dim=2)).sum(dim=1)
        pooled = pooled / lengths.unsqueeze(dim=1)

        return pooled, (word_embs, mask)

    encode = Encoder.decor_forward(forward)
180 |
181 |
class LSTM(Encoder):
    """Bi-directional LSTM encoder; pools by masked mean over all hidden states."""

    def __init__(self, hidden_dim, dropout, vocab_size, dim):
        super(LSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.dropout = dropout

        # Zero initial states registered as buffers so they follow the module
        # across devices and are included in checkpoints.
        self.register_buffer("e_hidden_init", torch.zeros(2, 1, hidden_dim))
        self.register_buffer("e_cell_init", torch.zeros(2, 1, hidden_dim))

        self.embedding = nn.Embedding(vocab_size, dim)
        self.lstm = nn.LSTM(
            dim,
            hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

    @staticmethod
    def from_args(args):
        return LSTM(args.hidden_dim, args.dropout, args.vocab_size, args.dim)

    @staticmethod
    def load(save_path: str) -> Encoder:
        """Download (if needed) and restore the pretrained LSTM encoder."""
        gdown.cached_download(
            urls_pretrained_model.PRETRAINED_LSTM_URL,
            os.path.join(save_path, "lstm.zip"),
            md5=urls_pretrained_model.PRETRAINED_LSTM_MD5,
            postprocess=gdown.extractall,
        )
        state_dict = torch.load(
            os.path.join(
                save_path, urls_pretrained_model.PRETRAINED_LSTM_FOLDER, "model"
            ),
            map_location=torch.device("cpu"),
        )
        # Recover hyperparameters from tensor shapes in the checkpoint.
        hidden_dim = state_dict["encoder.e_hidden_init"].shape[2]
        vocab_size, dim = state_dict["encoder.embedding.weight"].shape
        # Checkpoint keys are prefixed with `encoder.`; load via a wrapper module.
        m = nn.Module()
        m.encoder = LSTM(hidden_dim, 0, vocab_size, dim)
        m.load_state_dict(state_dict)
        m = m.encoder
        # HACK: for inference
        # NOTE(review): the vocab is read from the *AVG* model folder rather
        # than PRETRAINED_LSTM_FOLDER — looks like a copy-paste from
        # Averaging.load; confirm whether the LSTM archive ships its own vocab.
        vocab = torch.load(
            os.path.join(
                save_path, urls_pretrained_model.PRETRAINED_AVG_FOLDER, "vocab"
            )
        )
        m.vocab = vocab
        return m

    def forward(self, inputs, lengths):
        """Return (pooled, (all_hids, mask)); pooled is the masked mean of the
        bi-LSTM hidden states over valid positions."""
        bsz, max_len = inputs.size()
        e_hidden_init = self.e_hidden_init.expand(2, bsz, self.hidden_dim).contiguous()
        e_cell_init = self.e_cell_init.expand(2, bsz, self.hidden_dim).contiguous()
        # pack_padded_sequence requires sequences sorted by decreasing length.
        lens, indices = torch.sort(lengths, 0, True)

        in_embs = self.embedding(inputs)
        in_embs = F.dropout(in_embs, p=self.dropout, training=self.training)

        all_hids, (enc_last_hid, _) = self.lstm(
            pack(in_embs[indices], lens.tolist(), batch_first=True),
            (e_hidden_init, e_cell_init),
        )

        # Invert the length-sort so outputs line up with the original batch order.
        _, _indices = torch.sort(indices, 0)
        all_hids = unpack(all_hids, batch_first=True)[0][_indices]

        bs, max_len, _ = all_hids.shape
        # mask[i, t] = 1 while position t is within sequence i's true length.
        mask = (
            torch.arange(max_len).to(in_embs.device).expand(bs, max_len)
            < lengths.unsqueeze(1)
        ).float()
        pooled = (all_hids * mask.unsqueeze(dim=2)).sum(dim=1)
        pooled = pooled / lengths.unsqueeze(dim=1)

        return pooled, (all_hids, mask)

    encode = Encoder.decor_forward(forward)
262 |
263 |
class BERT(Encoder):
    """VarCLR-CodeBERT Model."""

    def __init__(self, bert_model: str, last_n_layer_output: int = 4):
        super().__init__()
        self.last_n_layer_output = last_n_layer_output
        self.transformer = AutoModel.from_pretrained(bert_model)

    @staticmethod
    def from_args(args):
        return BERT(args.bert_model, args.last_n_layer_output)

    @staticmethod
    def load(save_path: str) -> "BERT":
        """Fetch (cached) and unpack the pretrained VarCLR-CodeBERT weights."""
        archive_path = os.path.join(save_path, "bert.zip")
        gdown.cached_download(
            urls_pretrained_model.PRETRAINED_CODEBERT_URL,
            archive_path,
            md5=urls_pretrained_model.PRETRAINED_CODEBERT_MD5,
            postprocess=gdown.extractall,
        )
        model_dir = os.path.join(
            save_path, urls_pretrained_model.PRETRAINED_CODEBERT_FOLDER
        )
        return BERT(bert_model=model_dir)

    def forward(self, input_ids, attention_mask):
        """Return (pooled, (hidden_states, attention_mask)) where pooled is the
        first-token vector of the n-th-from-last transformer layer."""
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden_states = outputs.hidden_states
        cls_pooled = hidden_states[-self.last_n_layer_output][:, 0]

        return cls_pooled, (hidden_states, attention_mask)

    encode = Encoder.decor_bert_forward(forward)
302 |
303 |
class CodeBERT(BERT):
    """Original CodeBERT model https://github.com/microsoft/CodeBERT."""

    @staticmethod
    def load(save_path: str) -> BERT:
        # Weights come straight from the HuggingFace hub, so `save_path` is
        # intentionally unused and nothing is downloaded via gdown.
        return BERT(bert_model="microsoft/codebert-base")
310 |
--------------------------------------------------------------------------------
/varclr/models/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
class NCESoftmaxLoss(nn.Module):
    """Softmax cross-entropy loss (a.k.a., info-NCE loss in CPC paper)"""

    def __init__(self, nce_t):
        super().__init__()
        # Keep per-sample losses (no reduction) so callers decide how to reduce.
        self.loss = nn.CrossEntropyLoss(reduction="none")
        self.nce_t = nce_t

    def forward(self, x_ret, y_ret):
        """Symmetric info-NCE loss over a batch of paired embeddings.

        Each argument is a (pooled_embeddings, extras) tuple; row i of one
        side is the positive for row i of the other, and all remaining rows
        serve as in-batch negatives.
        """
        x_emb, _ = x_ret
        y_emb, _ = y_ret
        batch = x_emb.shape[0]
        # Cosine-similarity logits scaled by the NCE temperature.
        x_unit = x_emb / torch.norm(x_emb, dim=1, keepdim=True)
        y_unit = y_emb / torch.norm(y_emb, dim=1, keepdim=True)
        scores = torch.mm(x_unit, y_unit.t()) / self.nce_t
        target = torch.arange(batch, device=x_emb.device)
        # Contrast in both directions: x against all y, and y against all x.
        return self.loss(scores, target) + self.loss(scores.t(), target)
25 |
--------------------------------------------------------------------------------
/varclr/models/model.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytorch_lightning as pl
4 | import torch
5 | import torch.nn.functional as F
6 | from scipy.stats import pearsonr, spearmanr
7 | from torch import optim
8 | from varclr.models.encoders import Encoder
9 | from varclr.models.loss import NCESoftmaxLoss
10 |
11 |
class Model(pl.LightningModule):
    """LightningModule wrapping a VarCLR encoder trained with the info-NCE loss.

    Expects a datamodule to be attached as ``self.datamodule`` after
    construction (see varclr/pretrain.py) so the epoch-end hooks can name
    their metrics after the underlying data files.
    """

    def __init__(self, args):
        super(Model, self).__init__()
        self.args = args

        self.dropout = args.dropout
        self.loss = NCESoftmaxLoss(args.nce_t)
        # Vocab size is derived from the serialized vocab at args.vocab_path.
        args.vocab_size = len(torch.load(args.vocab_path))
        args.parentmodel = self
        self.encoder = Encoder.build(args)

    def _forward(self, batch):
        """Per-sample contrastive loss for ((x_ids, x_lens), (y_ids, y_lens))."""
        (x_idxs, x_lengths), (y_idxs, y_lengths) = batch
        x_ret = self.encoder(x_idxs, x_lengths)
        y_ret = self.encoder(y_idxs, y_lengths)

        return self.loss(x_ret, y_ret)

    def _score(self, batch):
        """Cosine similarity between the pooled embeddings of each pair."""
        (x_idxs, x_lengths), (y_idxs, y_lengths) = batch
        x_pooled, _ = self.encoder(x_idxs, x_lengths)
        y_pooled, _ = self.encoder(y_idxs, y_lengths)
        return F.cosine_similarity(x_pooled, y_pooled)

    def training_step(self, batch, batch_idx):
        loss = self._forward(batch).mean()
        self.log("loss/train", loss)
        return loss

    def _unlabeled_eval_step(self, batch, batch_idx):
        # Unlabeled data: report the contrastive loss itself.
        loss = self._forward(batch)
        return dict(loss=loss.detach().cpu())

    def _labeled_eval_step(self, batch, batch_idx):
        # Labeled data: score each pair against its human similarity label.
        *batch, labels = batch
        scores = self._score(batch)
        return dict(scores=scores.detach().cpu(), labels=labels.detach().cpu())

    def _shared_eval_step(self, batch, batch_idx):
        # A 3-tuple carries (x, y, labels); a 2-tuple is an unlabeled (x, y) pair.
        if len(batch) == 3:
            return self._labeled_eval_step(batch, batch_idx)
        elif len(batch) == 2:
            return self._unlabeled_eval_step(batch, batch_idx)

    def _unlabeled_epoch_end(self, outputs, prefix):
        loss = torch.cat([o["loss"] for o in outputs]).mean()
        self.log(f"loss/{prefix}", loss)

    def _labeled_epoch_end(self, outputs, prefix):
        # Correlations with the human labels, aggregated over the whole split.
        scores = torch.cat([o["scores"] for o in outputs]).tolist()
        labels = torch.cat([o["labels"] for o in outputs]).tolist()
        self.log(f"pearsonr/{prefix}", pearsonr(scores, labels)[0])
        self.log(f"spearmanr/{prefix}", spearmanr(scores, labels).correlation)

    def _shared_epoch_end(self, outputs, prefix):
        if "labels" in outputs[0]:
            self._labeled_epoch_end(outputs, prefix)
        else:
            self._unlabeled_epoch_end(outputs, prefix)

    def validation_step(self, batch, batch_idx):
        return self._shared_eval_step(batch, batch_idx)

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        return self._shared_eval_step(batch, batch_idx)

    def validation_epoch_end(self, outputs):
        # Metric names embed the validation file's basename.
        self._shared_epoch_end(
            outputs,
            f"val_{os.path.basename(self.datamodule.val_dataloader().dataset.data_file)}",
        )

    def test_epoch_end(self, outputs):
        # With multiple test dataloaders Lightning passes a list of lists.
        if isinstance(outputs[0], list):
            for idx, subset_outputs in enumerate(outputs):
                self._shared_epoch_end(
                    subset_outputs,
                    f"test_{os.path.basename(self.datamodule.test_dataloader()[idx].dataset.data_file)}",
                )
        else:
            self._shared_epoch_end(
                outputs,
                f"test_{os.path.basename(self.datamodule.test_dataloader().dataset.data_file)}",
            )

    def configure_optimizers(self):
        # AdamW for the BERT model, plain Adam for everything else.
        return {"bert": optim.AdamW}.get(self.args.model, optim.Adam)(
            self.parameters(), lr=self.args.lr
        )
101 |
--------------------------------------------------------------------------------
/varclr/models/tokenizers.py:
--------------------------------------------------------------------------------
1 | import sentencepiece as spm
2 | from transformers import AutoTokenizer
3 |
4 |
class Tokenizer:
    """Factory/base for the tokenizers used during preprocessing."""

    @staticmethod
    def build(sp_model):
        """Instantiate the tokenizer matching the given model identifier.

        The substring checks are ordered: an explicit SentencePiece model file
        wins over a generic "bert" or "split" name.
        """
        if "sp.20k.model" in sp_model:
            return SPTokenizer(sp_model)
        if "bert" in sp_model:
            return PretrainedTokenizer(sp_model)
        if "split" in sp_model:
            return SplitTokenizer()
        raise NotImplementedError

    def encode(self, text):
        raise NotImplementedError
19 |
20 |
class SplitTokenizer(Tokenizer):
    """Trivial tokenizer: split on whitespace."""

    def encode(self, text):
        # str.split() with no arguments already ignores leading/trailing
        # whitespace, so no explicit strip is needed.
        return text.split()
24 |
25 |
class SPTokenizer(Tokenizer):
    """SentencePiece tokenizer backed by a local model file."""

    def __init__(self, model_path) -> None:
        processor = spm.SentencePieceProcessor()
        processor.Load(model_path)
        self.sp = processor

    def encode(self, text):
        """Return the SentencePiece pieces for *text*."""
        return self.sp.EncodeAsPieces(text)
33 |
34 |
class PretrainedTokenizer(Tokenizer):
    """Wrapper around a HuggingFace tokenizer; encodes text to id strings.

    Also offers a process-wide singleton (set_instance/get_instance) so other
    components can share one tokenizer without re-loading it.
    """

    _instance = None

    @staticmethod
    def get_instance():
        return PretrainedTokenizer._instance

    @staticmethod
    def set_instance(tokenizer_name):
        PretrainedTokenizer._instance = AutoTokenizer.from_pretrained(tokenizer_name)

    def __init__(self, tokenizer_name) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def encode(self, text):
        """Return token ids (as strings) without special tokens, truncated."""
        token_ids = self.tokenizer.encode(
            text, add_special_tokens=False, truncation=True
        )
        return [str(token_id) for token_id in token_ids]
57 |
--------------------------------------------------------------------------------
/varclr/models/urls_pretrained_model.py:
--------------------------------------------------------------------------------
# HuggingFace tokenizer shared by the released VarCLR models.
PRETRAINED_TOKENIZER = "microsoft/codebert-base"

# Google Drive archives of the released checkpoints. Each model has a
# download URL, the folder name the zip extracts to, and the archive md5
# used by gdown.cached_download for cache validation (see encoders.py).
PRETRAINED_CODEBERT_URL = (
    "https://drive.google.com/uc?id=1xl8kdQtJ7ke4jyv5kHDiOc5dScPTTKzg"
)
PRETRAINED_CODEBERT_FOLDER = "varclr_bert"
PRETRAINED_CODEBERT_MD5 = "3844bd6e76a928084b0d742ac120a91c"

PRETRAINED_AVG_URL = "https://drive.google.com/uc?id=1IFWvFQ2YKvCNRroy2RBqwSeQPGhHShX7"
PRETRAINED_AVG_FOLDER = "varclr_avg"
PRETRAINED_AVG_MD5 = "97ca667fac013b9a93fb87e91c0c3a0c"

PRETRAINED_LSTM_URL = "https://drive.google.com/uc?id=1GZ9v0Zt4RazR1STBac8W116F-0bsRLzf"
PRETRAINED_LSTM_FOLDER = "varclr_lstm"
PRETRAINED_LSTM_MD5 = "a368f514ec16e45a58cbc94c67c67b80"
16 |
--------------------------------------------------------------------------------
/varclr/pretrain.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import random
4 |
5 | import numpy as np
6 | import pytorch_lightning as pl
7 | import torch
8 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
9 | from pytorch_lightning.loggers import WandbLogger
10 | from transformers import AutoModel
11 |
12 | from varclr.data.dataset import RenamesDataModule
13 | from varclr.models.model import Model
14 | from varclr.models.tokenizers import PretrainedTokenizer
15 | from varclr.utils.options import add_options
16 |
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    add_options(parser)
    args = parser.parse_args()
    # Seed python, numpy and torch RNGs for reproducible runs.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    dm = RenamesDataModule(
        args.train_data_file, args.valid_data_file, args.test_data_files, args
    )
    # Build the vocab file up front if it does not exist yet; Model.__init__
    # reads it from args.vocab_path.
    if not os.path.exists(args.vocab_path):
        dm.setup()

    model = Model(args)
    if args.load_file is not None:
        model = model.load_from_checkpoint(args.load_file, args=args, strict=False)
    # Attach the datamodule so epoch-end hooks can name metrics after data files.
    model.datamodule = dm

    if not args.test and "bert" in args.sp_model and args.model != "bert":
        # Load pre-trained word embeddings from bert
        bert = AutoModel.from_pretrained(args.sp_model)
        for word, idx in torch.load(args.vocab_path).items():
            try:
                model.encoder.embedding.weight.data[
                    idx
                ] = bert.embeddings.word_embeddings.weight.data[int(word)]
            except ValueError:
                # Vocab entries that are not numeric BERT ids (e.g. UNK) are skipped.
                pass
        del bert
    if "bert" in args.model:
        PretrainedTokenizer.set_instance(args.bert_model)

    # Monitor Spearman correlation on the labeled validation set when one is
    # given; otherwise fall back to the training-file validation loss.
    if args.valid_data_file is not None:
        callbacks = [
            EarlyStopping(
                monitor=f"spearmanr/val_{os.path.basename(dm.valid_data_file)}",
                mode="max",
                patience=args.patience,
            ),
            ModelCheckpoint(
                monitor=f"spearmanr/val_{os.path.basename(dm.valid_data_file)}",
                mode="max",
            ),
        ]
    else:
        callbacks = [
            EarlyStopping(
                monitor=f"loss/val_{os.path.basename(dm.train_data_file)}",
                patience=args.patience,
            ),
            ModelCheckpoint(monitor=f"loss/val_{os.path.basename(dm.train_data_file)}"),
        ]

    wandb_logger = WandbLogger(name=args.name, project="varclr", log_model=True)
    wandb_logger.log_hyperparams(args)
    # Re-read the (possibly sweep-overridden) config back from wandb.
    args = argparse.Namespace(**wandb_logger.experiment.config)
    trainer = pl.Trainer(
        max_epochs=args.epochs,
        logger=wandb_logger,
        gpus=args.gpu,
        auto_select_gpus=args.gpu > 0,
        gradient_clip_val=args.grad_clip,
        callbacks=callbacks,
        progress_bar_refresh_rate=10,
        val_check_interval=0.25,
        limit_train_batches=args.limit_train_batches,
    )

    if not args.test:
        trainer.fit(model, datamodule=dm)
        # will automatically load and test the best checkpoint instead of the last model
        trainer.test(datamodule=dm)
    else:
        # save in hf transformer ckpt format
        trainer.test(model, datamodule=dm)
94 |
--------------------------------------------------------------------------------
/varclr/utils/find_nn.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from infer import MockArgs
4 | from varclr.data.preprocessor import CodePreprocessor
5 |
if __name__ == "__main__":
    # "saved" is the dict produced by varclr/utils/infer.py: camelCased
    # variable names plus a row-aligned embedding matrix.
    ret = torch.load("saved")
    vars, embs = ret["vars"], ret["embs"]
    var2idx = dict([(var, idx) for idx, var in enumerate(vars)])
    # while (line := input()) != "":
    processor = CodePreprocessor(MockArgs())
    for line in [
        "substr",
        "item",
        "count",
        "rows",
        "setInterval",
        "minText",
        "files",
        "miny",
    ]:
        # Canonicalize the query, then re-camelCase it to match the stored names.
        line = "".join(
            [
                word.capitalize() if idx > 0 else word
                for idx, word in enumerate(processor(line.strip()).split())
            ]
        )
        if line not in var2idx:
            print("variable not found")
            continue
        # Top-21 by inner product; [1:] drops the first hit (presumably the
        # query itself — holds when embeddings are normalized; verify).
        result = torch.topk(embs @ embs[var2idx[line]], k=21)
        print([vars[idx] for idx in result.indices][1:])
33 |
--------------------------------------------------------------------------------
/varclr/utils/gen_typos.py:
--------------------------------------------------------------------------------
1 | import nlpaug.augmenter.char as nac
2 | import numpy as np
3 |
if __name__ == "__main__":
    # Sample a fixed subset of variable names and corrupt each with at most
    # one keyboard-adjacent character typo, writing "<typo> <original>" pairs.
    with open("var.txt") as f:
        variables = [line.strip() for line in f.readlines()]
    # Fixed seed so the sampled benchmark is reproducible.
    np.random.seed(42)
    variables = np.random.choice(variables, 1024)
    aug = nac.KeyboardAug(
        aug_char_max=1,
        include_special_char=False,
        include_numeric=False,
        include_upper_case=False,
    )
    with open("typo_corr.txt", "w") as f, open("typo_var.txt", "w") as f_var:
        for variable in variables:
            aug_var = aug.augment(variable)
            # Drop any whitespace so each output line stays a single identifier.
            variable, aug_var = variable.replace(" ", ""), aug_var.replace(" ", "")
            f.write(f"{aug_var} {variable}\n")
            f_var.write(f"{aug_var}\n")
21 |
--------------------------------------------------------------------------------
/varclr/utils/infer.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from tqdm import tqdm
5 | from transformers import AutoTokenizer, AutoModel
6 |
7 | from varclr.data.preprocessor import CodePreprocessor
8 |
9 |
def forward(model, input_ids, attention_mask):
    """Run *model* and pool by taking the first-token vector of the
    4th-from-last hidden layer."""
    outputs = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    )
    hidden_states = outputs.hidden_states
    return hidden_states[-4][:, 0]
17 |
18 |
class MockArgs:
    """Minimal stand-in for parsed CLI options."""

    def __init__(self):
        # An empty scheme makes CodePreprocessor skip the SentencePiece step.
        self.tokenization = ""
22 |
23 |
def batcher(batch_size):
    """Yield de-duplicated, preprocessed variable names from sys.argv[1] in
    batches of `batch_size` (relies on the module-level `processor`)."""
    uniq = set()
    with open(sys.argv[1]) as f:
        vars = []
        for var in f:
            var = processor(var.strip())
            if var not in uniq:
                uniq.add(var)
                vars.append(var)
                if len(vars) == batch_size:
                    yield vars
                    vars = []
        # Fix: only flush the remainder when non-empty. The unconditional
        # final yield emitted an empty batch whenever the unique count was an
        # exact multiple of batch_size, which the downstream HF tokenizer
        # cannot handle.
        if vars:
            yield vars
37 |
38 |
if __name__ == "__main__":
    # Embed every unique variable listed in sys.argv[1] with a fine-tuned
    # CodeBERT checkpoint and dump {"vars": [...], "embs": tensor} to "saved".
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    model = AutoModel.from_pretrained("bert_saved/")
    model.to(device)
    processor = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])
    for idx, vars in enumerate(tqdm(batcher(64))):
        ret = tokenizer(vars, return_tensors="pt", padding=True)
        embs = (
            forward(
                model, ret["input_ids"].to(device), ret["attention_mask"].to(device)
            )
            .detach()
            .cpu()
        )
        # Store names re-camelCased from their canonical space-separated form.
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved")
71 |
--------------------------------------------------------------------------------
/varclr/utils/infer_avg.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import random
4 | import numpy as np
5 | import torch
6 | from torch.nn.utils.rnn import pad_sequence
7 | from tqdm import tqdm
8 | from transformers import AutoTokenizer, AutoModel
9 |
10 | from models import Model
11 | from varclr.data.vocab import Vocab
12 | from varclr.data.preprocessor import CodePreprocessor
13 | from varclr.utils.options import add_options
14 |
15 |
def forward(model, input_ids, attention_mask):
    """Encode a batch with *model*, pooling the first-token vector of the
    4th-from-last hidden layer."""
    hidden = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    ).hidden_states
    return hidden[-4][:, 0]
23 |
24 |
class MockArgs:
    """Minimal stand-in for parsed CLI options."""

    def __init__(self):
        # An empty scheme makes CodePreprocessor skip the SentencePiece step.
        self.tokenization = ""
28 |
29 |
def batcher(batch_size):
    """Yield (var_ids, vars) column pairs in batches of `batch_size`, reading
    "var.txt" and de-duplicating on the SentencePiece form. Relies on the
    module-level `processor` / `processor2` preprocessing variants."""
    uniq = set()
    with open("var.txt") as f:
        vars = []
        for var in f:
            var_id = processor(var.strip())
            var = processor2(var.strip())
            if var_id not in uniq:
                uniq.add(var_id)
                vars.append((var_id, var))
                if len(vars) == batch_size:
                    yield zip(*vars)
                    vars = []
        # Fix: guard the final flush. `zip(*[])` is an empty iterator that
        # fails to unpack into `var_ids, vars` at the call site whenever the
        # unique count is an exact multiple of batch_size.
        if vars:
            yield zip(*vars)
44 |
45 |
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    add_options(parser)
    args = parser.parse_args()
    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model(args)
    model = model.load_from_checkpoint(args.load_file, args=args, strict=False)
    model = model.to(device)
    model.eval()
    vocab = torch.load(args.vocab_path)

    # Two preprocessors: one with the configured tokenization (for vocab ids)
    # and one with tokenization disabled (for display names).
    processor = CodePreprocessor(args)
    processor2 = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])

    def torchify(batch):
        # Pad variable-length id lists into one (batch, max_len) LongTensor.
        idxs = pad_sequence(
            [torch.tensor(ex, dtype=torch.long) for ex in batch],
            batch_first=True,
        )
        lengths = torch.tensor([len(e) for e in batch], dtype=torch.long)
        return idxs, lengths

    for var_ids, vars in tqdm(batcher(64)):
        batch = torchify(
            [
                [
                    Vocab.lookup(vocab, w, args.zero_unk)
                    for w in var.split()
                    if Vocab.lookup(vocab, w, args.zero_unk) is not None
                ]
                # If every token is OOV, fall back to a single UNK id.
                or [vocab[Vocab.unk_string]]
                for var in var_ids
            ]
        )
        x_idxs, x_lengths = batch
        ret = model.encoder(x_idxs.to(device), x_lengths.to(device))
        embs, _ = ret
        embs = embs.detach().cpu()
        # Store names re-camelCased from their canonical space-separated form.
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved_lstm")
105 |
--------------------------------------------------------------------------------
/varclr/utils/infer_ft_cbow.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | from tqdm import tqdm
5 |
6 | from utils import CodePreprocessor
7 | from varclr.data.preprocessor import CodePreprocessor
8 |
9 |
def forward(model, input_ids, attention_mask):
    """Pool the first-token vector of the 4th-from-last hidden layer.

    Note: not called from this script's __main__; kept for parity with the
    other infer utilities.
    """
    result = model(
        input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True
    )
    layers = result.hidden_states
    return layers[-4][:, 0]
17 |
18 |
class MockArgs:
    """Minimal stand-in for parsed CLI options."""

    def __init__(self):
        # An empty scheme makes CodePreprocessor skip the SentencePiece step.
        self.tokenization = ""
22 |
23 |
def batcher(batch_size):
    """Yield ([canonical], [raw]) list pairs of variables read from
    sys.argv[1], de-duplicated on the canonical form (relies on the
    module-level `processor`)."""
    uniq = set()
    with open(sys.argv[1]) as f:
        vars = []
        for uncanon_var in f:
            uncanon_var = uncanon_var.strip()
            var = processor(uncanon_var)
            if var not in uniq:
                uniq.add(var)
                vars.append((var, uncanon_var))
                if len(vars) == batch_size:
                    yield list(zip(*vars))
                    vars = []
        # Fix: skip the trailing empty batch. `list(zip(*[]))` is `[]`, which
        # fails to unpack into `vars, uncanon_vars` at the call site whenever
        # the unique count is an exact multiple of batch_size.
        if vars:
            yield list(zip(*vars))
38 |
39 |
def read_embs(fname):
    """Parse an embedding dump, keeping only lines whose first field is a
    quoted "ID:..." token; returns {key: 1-D float tensor} with the key
    including the "ID:" prefix."""
    embeddings = {}
    with open(fname) as handle:
        for raw in handle:
            if '"ID:' not in raw:
                continue
            token, *values = raw.strip().split()
            # Strip the surrounding quotes: take everything between the first
            # character and the closing '"'.
            key = token[1 : 1 + token[1:].index('"')]
            embeddings[key] = torch.tensor([float(v) for v in values])
    return embeddings
50 |
51 |
if __name__ == "__main__":
    # Collect precomputed cbow/fastText-style embeddings (sys.argv[2]) for the
    # variables listed in sys.argv[1] and dump them in the same
    # {"vars": [...], "embs": tensor} format as infer.py, to "saved_ft".
    processor = CodePreprocessor(MockArgs())
    ret_dict = dict(vars=[], embs=[])
    all_embs = read_embs(sys.argv[2])
    for vars, uncanon_vars in tqdm(batcher(64)):
        # Lazily look up each raw variable's embedding under its "ID:..." key;
        # a missing key raises KeyError when the generator is consumed below.
        embs = (all_embs[f"ID:{v}"] for v in uncanon_vars)
        # Store names re-camelCased from their canonical space-separated form.
        ret_dict["vars"].extend(
            [
                "".join(
                    [
                        word.capitalize() if idx > 0 else word
                        for idx, word in enumerate(var.split())
                    ]
                )
                for var in vars
            ]
        )
        ret_dict["embs"].extend(embs)
    ret_dict["embs"] = torch.stack(ret_dict["embs"])
    print(len(ret_dict["vars"]))
    print(ret_dict["embs"].shape)
    torch.save(ret_dict, "saved_ft")
74 |
--------------------------------------------------------------------------------
/varclr/utils/options.py:
--------------------------------------------------------------------------------
def add_options(parser):
    """Register all VarCLR command-line options on *parser*.

    Groups: dataset paths/tokenization, model architecture, and training
    hyperparameters. Mutates *parser* in place; returns None.
    """
    # fmt: off
    # Dataset
    parser.add_argument("--train-data-file", default="cs-cs.var.tok.txt", help="training data")
    parser.add_argument("--valid-data-file", default=None, type=str, help="validation data")
    parser.add_argument("--test-data-files", default="varclr/benchmarks/idbench/small_pair_wise.csv,varclr/benchmarks/idbench/medium_pair_wise.csv,varclr/benchmarks/idbench/large_pair_wise.csv", help="test data")
    parser.add_argument("--zero-unk", default=1, type=int, help="whether to ignore unknown tokens")
    parser.add_argument("--ngrams", default=3, type=int, help="whether to use character n-grams")
    parser.add_argument("--tokenization", default="sp", type=str, choices=["sp", "ngrams"], help="which tokenization to use")
    parser.add_argument("--sp-model", default="microsoft/codebert-base-mlm", help="SP model to load for evaluation")
    parser.add_argument("--vocab-path", default="cs-cs.var.tok.txt.codebert.vocab", type=str, help="Path to vocabulary")
    # Fixed copy-paste error: help text previously said "Path to vocabulary".
    parser.add_argument("--num-workers", default=4, type=int, help="number of dataloader worker processes")

    # Model
    parser.add_argument("--model", default="avg", choices=["avg", "lstm", "attn", "bert"], help="type of base model to train.")
    parser.add_argument("--bert-model", default="microsoft/codebert-base-mlm", help="type of bert model to load.")
    parser.add_argument("--dim", default=768, type=int, help="dimension of input embeddings")
    parser.add_argument("--hidden-dim", default=150, type=int, help="hidden dim size of LSTM")
    parser.add_argument("--scramble-rate", default=0, type=float, help="rate of scrambling in for LSTM")
    parser.add_argument("--delta", default=0.4, type=float, help="margin size for margin ranking loss")
    parser.add_argument("--nce-t", default=0.05, type=float, help="temperature for noise contrastive estimation loss")
    parser.add_argument("--temperature", default=100, type=float, help="temperature for biattn scorer")
    parser.add_argument("--last-n-layer-output", default=1, type=int, help="last layer representation used as output")

    # Training
    parser.add_argument("--name", default="Ours-FT", help="method name")
    parser.add_argument("--gpu", default=1, type=int, help="whether to train on gpu")
    parser.add_argument("--grad-clip", default=1., type=float, help='clip threshold of gradients')
    parser.add_argument("--epochs", default=300, type=int, help="number of epochs to train")
    parser.add_argument("--limit-train-batches", default=1.0, type=float, help="number of batches for each training epoch")
    parser.add_argument("--patience", default=40, type=int, help="early stopping patience")
    parser.add_argument("--lr", default=0.001, type=float, help="learning rate")
    parser.add_argument("--dropout", default=0.5, type=float, help="dropout rate")
    parser.add_argument("--batch-size", default=1024, type=int, help="size of batches")
    parser.add_argument("--load-file", help="filename to load a pretrained model.")
    parser.add_argument("--test", action="store_true", help="only do evaluation")
    parser.add_argument("--train-percent", default=1.0, type=float, help="percentage of data used for training")
    parser.add_argument("--seed", default=42, type=int)
    # fmt: on
--------------------------------------------------------------------------------
/varclr/utils/similarity_search.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import defaultdict
3 |
4 | import torch
5 |
6 | from varclr.utils.infer import MockArgs
7 | from varclr.data.preprocessor import CodePreprocessor
8 |
if __name__ == "__main__":
    # Usage: python similarity_search.py <pairs_file> <saved_embeddings>
    # Reports Recall@K: for each (var1, var2) pair, whether var2 appears among
    # the K nearest neighbors of var1 in embedding space.
    ret = torch.load(sys.argv[2])
    vars, embs = ret["vars"], ret["embs"]
    # L2-normalize so the dot products below are cosine similarities.
    embs /= embs.norm(dim=1, keepdim=True)
    embs = embs.cuda()
    var2idx = {var: idx for idx, var in enumerate(vars)}
    processor = CodePreprocessor(MockArgs())

    def canon(var):
        # Canonicalize a variable name: preprocess, then join the resulting
        # words into camelCase (first word lower-cased as produced).
        return "".join(
            word.capitalize() if idx > 0 else word
            for idx, word in enumerate(processor(var).split())
        )

    Ks = [1, 5, 10, 25, 50, 100, 250, 500, 1000]
    max_k = max(Ks)
    topk_succ = defaultdict(int)
    tot = 0
    with open(sys.argv[1], "r") as f:
        for line in f:
            try:
                var1, var2 = line.strip().split()
            except ValueError:
                # Malformed line (not exactly two tokens): skip it. The
                # original fell through without `continue`, reusing stale
                # values from the previous iteration (NameError on line 1).
                print("skipped: ", line)
                continue
            var1, var2 = canon(var1), canon(var2)
            if var1 not in var2idx or var2 not in var2idx:
                print(f"variable {var1} or {var2} not found")
                continue
            tot += 1
            # One top-(max_k + 1) query covers every K; index 0 is the query
            # variable itself, so drop it before checking membership.
            result = torch.topk(embs @ embs[var2idx[var1]], k=max_k + 1)
            neighbors = [vars[idx] for idx in result.indices][1:]
            for k in Ks:
                topk_succ[k] += var2 in neighbors[:k]

    print(f"Total {tot} variable pairs")
    for k in Ks:
        print(f"Recall@{k} = {100 * topk_succ[k] / tot:.1f}")
46 |
--------------------------------------------------------------------------------