├── LICENSE ├── README.md ├── configs ├── data_configs │ ├── data-config-16k.json │ ├── data-config-4k.json │ ├── data-config-max250k.json │ └── data-config1.json ├── eval_configs │ └── io-config-1.json ├── infer_configs │ ├── infer-config-16k-ch5.json │ ├── infer-config-250k-ch5.json │ ├── infer-config-4k-ch5.json │ ├── infer-config-base-ch10.json │ ├── infer-config-base-ch5.json │ ├── infer-config-big-ch5.json │ ├── infer-config-med-ch5.json │ ├── infer-config-small-ch5.json │ ├── infer-config-wd-ch5.json │ ├── infer-config1.json │ ├── infer-config2.json │ └── infer-config3.json └── train_configs │ ├── train-config-16k.sh │ ├── train-config-250k.sh │ ├── train-config-4k.sh │ ├── train-config-base.sh │ ├── train-config-big.sh │ ├── train-config-med.sh │ ├── train-config-small-massivewd.sh │ ├── train-config-small.sh │ ├── train-config-wd.sh │ └── train-config1.sh ├── def_results_logs ├── eval_big.log ├── eval_big_plus.log ├── eval_big_vocab.log ├── eval_half_data.log ├── eval_med.log ├── eval_more_epochs.log ├── eval_more_weight_decay.log ├── eval_small.log └── eval_small_vocab.log ├── evaluate-bleu.py ├── evaluate-io-legacy.py ├── evaluate-syntax.py ├── experiments └── .keep ├── infer.py ├── job-gen.py ├── jobs └── .keep ├── neural_compilers ├── eval │ └── evaluator.py ├── train │ └── data_generator.py └── utils │ ├── config.py │ ├── constants.py │ ├── tokenization.py │ └── utilities.py ├── prepare-train-data.py ├── requirements.txt ├── scripts ├── corr.py ├── create-infer-jobs.sh ├── extract-gen-eval-syntax.sh ├── extract-gen.sh ├── format-results-each.sh ├── freq-errors.sh ├── gen.sh ├── generate-slurm-eval-16k-chkpt5.sh ├── generate-slurm-eval-250k-chkpt5.sh ├── generate-slurm-eval-4k-chkpt5.sh ├── generate-slurm-eval-base-chpt10.sh ├── generate-slurm-eval-base-chpt5.sh ├── generate-slurm-eval-big-chkpt5.sh ├── generate-slurm-eval-med-chkpt5.sh ├── generate-slurm-eval-small-chkpt5.sh ├── generate-slurm-eval-wd-chkpt5.sh ├── generate-slurm-eval.sh ├── generate-slurm.sh ├── get-anghabench-data.sh ├── get-synthesis-benchmark.sh ├── intersections.sh └── train │ ├── train-baseline-supervised-slurm.sh │ └── train-baseline-supervised.sh ├── setup.sh └── slurm_logs └── .keep /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jordi Armengol Estapé 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Compilers 2 | 3 | Code for building a neural compiler (a Transformer that learns to translate C to x86), from "Learning C to x86 Translation: An Experiment in Neural Compilation" (https://arxiv.org/abs/2108.07639). 4 | 5 | **Warning: The code is not properly documented and organized since this is an ongoing effort.** 6 | 7 | ## Steps to run 8 | 9 | 1 - Create and activate a virtual environment, install dependencies and get the data: 10 | 11 | 12 | bash setup.sh 13 | 14 | 15 | 2 - Prepare training data: 16 | 17 | python prepare-train-data.py --config-file configs/data_configs/data-config1.json 18 | 19 | 3 - Train the model (with Fairseq): 20 | 21 | DATA_PATH="data/YOUR_DATA" # Path to the data generated in step 2 22 | RUN="$DATA_PATH/YOUR_MODEL" 23 | 24 | mkdir -p $RUN 25 | 26 | fairseq-train \ 27 | $DATA_PATH \ 28 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 29 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 30 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 31 | --weight-decay 0.0001 \ 32 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 33 | --max-tokens 4096 \ 34 | --eval-bleu \ 35 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 36 | --eval-bleu-remove-bpe \ 37 | --eval-bleu-print-samples \ 38 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 39 | --save-dir $RUN/checkpoints -s c -t s \ 40 | --update-freq 4 41 | 42 | 4 - Run inference on the I/O benchmark: 43 | 44 | python infer.py --config-file YOUR_CONFIG.json # See configs/infer_configs/infer-config-4k-ch5.json 45 | 46 | 5 - Evaluate on I/O examples: 47 | 48 | 49 | python evaluate-io-legacy.py --synthesis-eval-path synthesis-eval \ 50 | --predictions-path PATH_TO_PREDICTIONS # Predictions generated in step 4 51 | 52 | 53 | ## How to cite 54 | If you use any of these resources (datasets or models) in your work, please cite our latest paper: 55 | ```bibtex 56 | @misc{armengolestape2021learning, 57 | title={Learning C to x86 Translation: An Experiment in Neural Compilation}, 58 | author={Jordi Armengol-Estap\'e and Michael F. P.
O'Boyle}, 59 | year={2021}, 60 | eprint={2108.07639}, 61 | archivePrefix={arXiv}, 62 | primaryClass={cs.AI} 63 | } 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /configs/data_configs/data-config-16k.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_config": { 3 | "tokenizer_type": "pygments", 4 | "subword_tokenizer": "subword-nmt", 5 | "subword_vocab_size": 16384, 6 | "shared_vocab": true 7 | }, 8 | "input_path": "AnghaBench", 9 | "output_path": "experiments", 10 | "min_tokens": 16, 11 | "max_tokens": 314, 12 | "supervised": true, 13 | "valid_test_size": 1000, 14 | "seed": 42, 15 | "just_func": true 16 | } -------------------------------------------------------------------------------- /configs/data_configs/data-config-4k.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_config": { 3 | "tokenizer_type": "pygments", 4 | "subword_tokenizer": "subword-nmt", 5 | "subword_vocab_size": 4096, 6 | "shared_vocab": true 7 | }, 8 | "input_path": "AnghaBench", 9 | "output_path": "experiments", 10 | "min_tokens": 16, 11 | "max_tokens": 314, 12 | "supervised": true, 13 | "valid_test_size": 1000, 14 | "seed": 42, 15 | "just_func": true 16 | } -------------------------------------------------------------------------------- /configs/data_configs/data-config-max250k.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_config": { 3 | "tokenizer_type": "pygments", 4 | "subword_tokenizer": "subword-nmt", 5 | "subword_vocab_size": 8192, 6 | "shared_vocab": true 7 | }, 8 | "input_path": "AnghaBench", 9 | "output_path": "output", 10 | "min_tokens": 16, 11 | "max_tokens": 314, 12 | "supervised": true, 13 | "valid_test_size": 1000, 14 | "seed": 42, 15 | "just_func": true, 16 | "max_train_data": 250000 17 | } -------------------------------------------------------------------------------- /configs/data_configs/data-config1.json: -------------------------------------------------------------------------------- 1 | { 2 | "tokenizer_config": { 3 | "tokenizer_type": "pygments", 4 | "subword_tokenizer": "subword-nmt", 5 | "subword_vocab_size": 8192, 6 | "shared_vocab": true 7 | }, 8 | "input_path": "AnghaBench", 9 | "output_path": "output", 10 | "min_tokens": 16, 11 | "max_tokens": 314, 12 | "supervised": true, 13 | "valid_test_size": 1000, 14 | "seed": 42, 15 | "just_func": true 16 | } -------------------------------------------------------------------------------- /configs/eval_configs/io-config-1.json: -------------------------------------------------------------------------------- 1 | {"references": "synthesis-eval/examples", 2 | "implementations": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-21-2007-1002-f052/examples" 3 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-16k-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path":"model_16k/checkpoints/checkpoint5.pt" 9 | } 
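The infer configs in `configs/infer_configs` (this one and the ones that follow) all share the schema consumed by `infer.py`: `c_path`, `model_path`, `beam`, `top_n`, `add_header_footer`, `replace_true_false`, and an optional `checkpoint_path`, which `infer.py` resolves relative to `model_path` when it is not absolute. Below is a minimal sketch, not part of the repository, of building such a config programmatically and launching README step 4 with it; the experiment directory, config filename, and checkpoint path are placeholders you would substitute with your own.

```python
import json
import subprocess
from pathlib import Path

# Placeholder paths: point model_path at your own prepared-data/model directory
# (the one containing gen-data-config.json, tokenizer.json and checkpoints/).
config = {
    "c_path": "synthesis-eval/examples",
    "model_path": "experiments/YOUR_EXPERIMENT",
    "beam": 5,
    "top_n": 5,  # kept <= beam: beam search cannot return more hypotheses than the beam holds
    "add_header_footer": False,
    "replace_true_false": False,
    "checkpoint_path": "model/checkpoints/checkpoint5.pt",  # relative to model_path, or absolute
}

# Write the config next to the shipped ones, then run inference with it.
config_path = Path("configs/infer_configs/infer-config-custom.json")
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(config, indent=2))

# Equivalent to README step 4: run the trained model on the I/O benchmark sources.
subprocess.run(["python", "infer.py", "--config-file", str(config_path)], check=True)
```

The relative `checkpoint_path` form mirrors `infer-config-16k-ch5.json` above; `infer.py` joins it with `model_path` before invoking `fairseq-interactive`.
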
-------------------------------------------------------------------------------- /configs/infer_configs/infer-config-250k-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path":"model250k/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-4k-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path":"model_4k/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-base-ch10.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "checkpoint10.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-base-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-big-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "/home/usuaris/veu/jordi.armengol/phd-preliminary/runs/big-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-med-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "/home/usuaris/veu/jordi.armengol/phd-preliminary/runs/med-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-small-ch5.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "/home/usuaris/veu/jordi.armengol/phd-preliminary/runs/small-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config-wd-ch5.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false, 8 | "checkpoint_path": "/home/usuaris/veu/jordi.armengol/phd-preliminary/runs/wd0.01-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config1.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "", 3 | "model_path": "", 4 | "beam": 1, 5 | "top_n": 1, 6 | "add_header_footer": true, 7 | "replace_true_false": true 8 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config2.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "examples_", 3 | "model_path": "baseline4", 4 | "beam": 1, 5 | "top_n": 1, 6 | "add_header_footer": false, 7 | "replace_true_false": false 8 | } -------------------------------------------------------------------------------- /configs/infer_configs/infer-config3.json: -------------------------------------------------------------------------------- 1 | { 2 | "c_path": "synthesis-eval/examples", 3 | "model_path": "experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f", 4 | "beam": 5, 5 | "top_n": 5, 6 | "add_header_footer": false, 7 | "replace_true_false": false 8 | } -------------------------------------------------------------------------------- /configs/train_configs/train-config-16k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad" 4 | RUN="experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad/model_16k" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s \ 18 | --update-freq 4 19 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-250k.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294" 4 | RUN="output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294/model250k" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s \ 18 | --update-freq 4 19 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-4k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039" 4 | RUN="experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039/model_4k" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s \ 18 | --update-freq 4 19 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/baseline4-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --eval-bleu \ 17 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 18 | --eval-bleu-detok moses \ 19 | --eval-bleu-remove-bpe \ 20 | --eval-bleu-print-samples \ 21 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 22 | --save-dir $RUN/checkpoints -s c -t s \ 23 | --update-freq 4 24 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-big.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/big-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 
6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 1024 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s \ 18 | --update-freq 16 --encoder-layers 8 --decoder-layers 8 19 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-med.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/med-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s \ 18 | --update-freq 4 --encoder-ffn-embed-dim 2048 --encoder-attention-heads 8 --decoder-ffn-embed-dim 2048 \ 19 | --decoder-attention-heads 8 20 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-small-massivewd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/smallmassivewd-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 13 | --dropout 0.3 --weight-decay 1.0 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir $RUN/checkpoints -s c -t s 18 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-small.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/small-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 13 | --dropout 0.3 --weight-decay 0.0001 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --tensorboard-logdir $RUN/tb \ 17 | --save-dir 
$RUN/checkpoints -s c -t s 18 | -------------------------------------------------------------------------------- /configs/train_configs/train-config-wd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 4 | RUN="runs/wd0.01-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 5 | 6 | mkdir -p $RUN 7 | 8 | fairseq-train \ 9 | $DATA_PATH \ 10 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 11 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 12 | --lr 0.001 --lr-scheduler inverse_sqrt --min-lr '1e-09' --warmup-updates 4000 --warmup-init-lr '1e-07'\ 13 | --weight-decay 0.01 \ 14 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 15 | --max-tokens 4096 \ 16 | --eval-bleu \ 17 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 18 | --eval-bleu-detok moses \ 19 | --eval-bleu-remove-bpe \ 20 | --eval-bleu-print-samples \ 21 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 22 | --save-dir $RUN/checkpoints -s c -t s \ 23 | --update-freq 4 24 | -------------------------------------------------------------------------------- /configs/train_configs/train-config1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_PATH="data/AnghaBench-supervised-2021-04-12-1045-1ad3533bfc9df19cc9bff3ba72487765-71fb-f770" 4 | 5 | SELF=$(basename $BASH_SOURCE) 6 | DATE=$(date "+%d-%m-%Y_%H-%M-%S") 7 | RUN="$DATA_PATH/$SELF-$DATE" 8 | 9 | mkdir -p $RUN 10 | 11 | fairseq-train \ 12 | $DATA_PATH \ 13 | --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ 14 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 15 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 16 | --dropout 0.3 --weight-decay 0.0001 \ 17 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 18 | --max-tokens 4096 \ 19 | --eval-bleu \ 20 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 21 | --eval-bleu-detok moses \ 22 | --eval-bleu-remove-bpe \ 23 | --eval-bleu-print-samples \ 24 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 25 | --save-dir $RUN/checkpoints >> $RUN/train.log -------------------------------------------------------------------------------- /evaluate-bleu.py: -------------------------------------------------------------------------------- 1 | from bleu import list_bleu 2 | import os 3 | from neural_compilers.utils.tokenization import PygmentsTokenizer 4 | from pathlib import Path 5 | import json 6 | from copy import deepcopy 7 | from neural_compilers.eval.evaluator import save_eval 8 | 9 | code_tokenizer = PygmentsTokenizer() 10 | 11 | 12 | def eval_bleu(ref: str, hyp: str) -> float: 13 | return list_bleu(ref, hyp, detok=False) 14 | 15 | 16 | def eval_line_by_line_file(ref_path: str, out_path: str): 17 | score = 0.0 18 | count = 0 19 | with open(ref_path, 'r') as refs, open(out_path, 'r') as hyps: 20 | for ref, hyp in zip(refs.readlines(), hyps.readlines()): 21 | count += 1 22 | ref_tok = ' '.join(code_tokenizer.tokenize(programs=ref, lang='asm')) 23 | hyp_tok = ' '.join(code_tokenizer.tokenize(programs=hyp, lang='asm')) 24 | score += eval_bleu(ref_tok, hyp_tok) 25 | return score/count 26 | 27 | 28 | def eval_dir(ref_path: str, out_path: 
str): 29 | score = 0.0 30 | count = 0 31 | asm_files_ref = sorted(Path(ref_path).rglob('*.s')) 32 | asm_files_out = sorted(Path(out_path).rglob('*.s')) 33 | for asm_file_ref, asm_file_out in zip(asm_files_ref, asm_files_out): 34 | with open(asm_file_ref, 'r') as ref, open(asm_file_out, 'r') as hyp: 35 | ref = ref.read() 36 | hyp = hyp.read() 37 | count += 1 38 | ref_tok = ' '.join(code_tokenizer.tokenize(programs=ref, lang='asm')) 39 | hyp_tok = ' '.join(code_tokenizer.tokenize(programs=hyp, lang='asm')) 40 | score += eval_bleu(ref_tok, hyp_tok) 41 | return score/count 42 | 43 | 44 | if __name__ == '__main__': 45 | import argparse 46 | parser = argparse.ArgumentParser('BLEU evaluator') 47 | parser.add_argument('--ref-path', help="Reference C directory or file. If it's a file path, then one function per" 48 | "line. Otherwise, one function per file.") 49 | parser.add_argument('--out-path', help="System output ASM directory or file. If it's a file path, then one function" 50 | "per line. Otherwise, one function per file.") 51 | parser.add_argument('--config-file', type=str, help='Path to JSON file (instead of command line arguments)') 52 | 53 | args = parser.parse_args() 54 | 55 | orig_args = deepcopy(args) 56 | 57 | if args.config_file: 58 | with open(args.config_file, 'r') as f: 59 | config = json.load(f) 60 | args = argparse.Namespace(**config) 61 | else: 62 | config = vars(args) 63 | 64 | if os.path.isdir(args.ref_path): 65 | res = eval_dir(config['ref_path'], config['out_path']) 66 | else: 67 | res = eval_line_by_line_file(config['ref_path'], config['out_path']) 68 | save_eval(system_output_path=config['out_path'], eval_script=__file__, config=config, results=dict(bleu=res)) 69 | print(res) 70 | -------------------------------------------------------------------------------- /evaluate-io-legacy.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, asdict 3 | from neural_compilers.utils.tokenization import GCC 4 | from typing import * 5 | import logging 6 | import time 7 | import uuid 8 | import git 9 | from bleu import list_bleu 10 | from neural_compilers.utils.tokenization import PygmentsTokenizer 11 | import lizard 12 | import re, itertools 13 | from copy import deepcopy 14 | import sys 15 | from io import StringIO 16 | import contextlib 17 | import json 18 | 19 | 20 | def print(*args): 21 | logging.info(' '.join([str(arg) for arg in args])) 22 | 23 | 24 | code_tokenizer = PygmentsTokenizer() 25 | 26 | 27 | def eval_bleu(ref: str, hyp: str) -> float: 28 | return list_bleu([ref], [hyp], detok=False) 29 | 30 | JUST_FUNC = True 31 | 32 | BAD_CASES = [] 33 | 34 | BAD_EXAMPLES = {} 35 | 36 | @dataclass 37 | class FuncParameter: 38 | type_name: str 39 | var_name: str 40 | 41 | 42 | @dataclass 43 | class Signature: 44 | return_type: str 45 | func_name: str 46 | parameters: List[FuncParameter] 47 | 48 | 49 | @dataclass 50 | class Example: 51 | inp: List[str] 52 | out: str 53 | 54 | @dataclass 55 | class ExampleList: 56 | signature: Signature 57 | examples: List[Example] 58 | 59 | @dataclass 60 | class Array: 61 | var_name: str 62 | size: int 63 | 64 | @dataclass 65 | class Props: 66 | output: List[str] 67 | arrays: List[Array] 68 | 69 | 70 | # From IO-EMBEDDINGS repo 71 | def parse_file(path: str) -> Tuple[Signature, Example]: 72 | with open(path, 'r') as f: 73 | lines = f.readlines() 74 | # added hack for avoiding comments, macros, empty lines. 
TODO: review, improve 75 | lines = [line for line in lines if not line.startswith('//') and not line.startswith('#') and len(line.split()) > 0] 76 | signature = lines[0] 77 | signature_split = signature.split() 78 | return_type = signature_split[0] 79 | func_name = signature_split[1].split('(')[0] 80 | parameters = signature[signature.find('(') + 1:signature.find(')')] 81 | parsed_parameters = [] 82 | for parameter in parameters.split(','): 83 | pointer = False 84 | if parameter.count('**') > 1: 85 | raise RuntimeError(parameter) 86 | if '*' in parameter: 87 | parameter = parameter.replace('*', '') 88 | pointer = True 89 | parameter = ' '.join(parameter.split()) 90 | param_type, param_name = parameter.split() 91 | if pointer: 92 | param_type += '*' 93 | parsed_parameters.append(FuncParameter(type_name=param_type, var_name=param_name)) 94 | parsed_signature = Signature(return_type=return_type, func_name=func_name, parameters=parsed_parameters) 95 | parsed_example = None 96 | return parsed_signature, parsed_example 97 | 98 | 99 | def get_single_scanf(parameter: FuncParameter, declare: bool = True) -> str: 100 | scanf = [] 101 | if parameter.type_name in ['int', 'bool']: 102 | if declare: 103 | scanf.append(f' int {parameter.var_name};') 104 | scanf.append(f' scanf("%d", &{parameter.var_name});') 105 | elif parameter.type_name == 'float': 106 | if declare: 107 | scanf.append(f' float {parameter.var_name};') 108 | scanf.append(f' scanf("%f", &{parameter.var_name});') 109 | elif parameter.type_name == 'char': 110 | if declare: 111 | scanf.append(f' char {parameter.var_name};') 112 | scanf.append(f' scanf("%c", &{parameter.var_name});') 113 | else: 114 | raise NotImplementedError(parameter.type_name) 115 | return '\n'.join(scanf) 116 | 117 | def infer_size_from_code_or_examples(func_code: str, parameter: FuncParameter, examples) -> str: 118 | for line in func_code.splitlines(): 119 | if 'for' in line: 120 | if parameter.var_name in line: 121 | before, _, after = line.partition('<') 122 | return after.split()[0].replace(';', '') 123 | raise RuntimeError('Cannot infer size from code') 124 | 125 | 126 | # reverse_scalars: in simpl, scalars seem to be in reverse order 127 | def get_scanf(signature: Signature, props: Props, func_code: str, reverse_scalars: bool = True) -> str: 128 | # hack to have n before arrays of size n 129 | scalar_scanfs = [] 130 | array_scanfs = [] 131 | for parameter in signature.parameters: 132 | 133 | if parameter.type_name.count('*') > 1: 134 | raise NotImplementedError(parameter.type_name) 135 | elif parameter.type_name.count('*') == 0: 136 | scalar_scanfs.append(get_single_scanf(parameter)) 137 | else: # == 1 138 | size = None 139 | for array in props.arrays: 140 | if array.var_name == parameter.var_name: 141 | size = array.size 142 | break 143 | scalar_type = parameter.type_name.replace('*', '') 144 | element = FuncParameter(type_name=scalar_type, var_name=parameter.var_name+'[idx]') 145 | single_scanf = get_single_scanf(element, declare=False) 146 | array_scanfs.append(f' {parameter.type_name} {parameter.var_name};') 147 | array_scanfs.append(f" {parameter.var_name} = ({parameter.type_name}) malloc({size}*sizeof({parameter.type_name.replace('*','')}));") 148 | array_scanfs.append(' int idx;') 149 | array_scanfs.append(f' for (idx = 0; idx < {size}; idx++) ' + '{') 150 | array_scanfs.append(' ' + single_scanf) 151 | array_scanfs.append(' }') 152 | 153 | if len(scalar_scanfs) > 1 and reverse_scalars: 154 | scalar_scanfs.reverse() 155 | 156 | scanf = scalar_scanfs + 
array_scanfs 157 | 158 | return '\n'.join(scanf) + '\n' 159 | 160 | 161 | def get_function_call(signature: Signature) -> str: 162 | res = '' 163 | if signature.return_type != 'void': 164 | res = f' {signature.return_type} res;\n' 165 | res += ' res = ' 166 | res += signature.func_name + '(' + ' '.join([par.var_name + ',' for par in signature.parameters]) 167 | if len(signature.parameters) > 0: 168 | res = res[:-1] 169 | res += ');\n' 170 | return res 171 | 172 | 173 | def get_single_printf(type_: str, var_name: str, trailing_space: bool = False) -> str: 174 | space = ' ' if trailing_space else '' 175 | if type_ in ['int', 'bool']: # TODO: check bool 176 | return f' printf("{space}%d", {var_name});' 177 | elif type_ == 'float': 178 | return f' printf("{space}%f", {var_name});' 179 | elif type_ == 'char': 180 | return f' printf({space}"%c", {var_name});' 181 | else: 182 | raise NotImplementedError(type_) 183 | 184 | 185 | def print_newline() -> str: 186 | return ' printf("\\n");' 187 | 188 | 189 | def get_printf(signature: Signature, props: Props) -> str: 190 | props = deepcopy(props) 191 | if signature.return_type == 'void': 192 | done_output = set([]) 193 | printf = [] 194 | for i in range(len(props.output)): 195 | result_type = None 196 | var_name = None 197 | for parameter in signature.parameters: 198 | # if parameter.var_name == props.output: 199 | if parameter.var_name in props.output and parameter.var_name not in done_output: 200 | result_type = parameter.type_name.replace('*', '') 201 | var_name = parameter.var_name 202 | break 203 | size = None 204 | for array in props.arrays: 205 | # if array.var_name == props.output: 206 | if array.var_name in props.output and array.var_name not in done_output: 207 | size = array.size 208 | assert var_name == array.var_name 209 | break 210 | if size is None: 211 | pass 212 | done_output.add(var_name) 213 | 214 | printf.append(' int idx;') 215 | printf.append(' int first = 0;') 216 | printf.append(f' for (idx = 0; idx < {size}; idx++) ' + '{') 217 | printf.append(' if (first) {') 218 | printf.append(' ' + get_single_printf(result_type, var_name=var_name+'[idx]')) 219 | printf.append(' first = 0;') 220 | printf.append(' }') 221 | printf.append(' else {') 222 | printf.append(' ' + get_single_printf(result_type, var_name=var_name+'[idx]', trailing_space=True)) 223 | printf.append(' }') 224 | printf.append(' }') 225 | printf = '\n'.join(printf) + '\n' + print_newline() + '\n' 226 | 227 | else: 228 | printf = get_single_printf(signature.return_type, var_name='res') + '\n' + print_newline() + '\n' 229 | return printf 230 | 231 | def parse_props(props_str: str) -> Props:#, signature: Signature): 232 | # signature could be parsed from parse but we already have it 233 | if props_str.startswith('void') or props_str.count('output') > 0: 234 | result = [] 235 | for l in props_str.splitlines(): 236 | if l.startswith('output'): 237 | result.append(l.split()[1]) 238 | if len(result) == 0: 239 | print('WARNING: Props output not found, using the only array instead') 240 | for l in props_str.splitlines()[1:]: 241 | _, var_name, size = l.split() 242 | var_name = var_name[:-1] 243 | result = [var_name] 244 | break 245 | else: 246 | result = ['res'] 247 | arrays = [] 248 | for l in props_str.splitlines()[1:]: 249 | if l.startswith('output'): 250 | continue 251 | _, var_name, size = l.split() 252 | var_name = var_name[:-1] 253 | array = Array(var_name=var_name, size=size) 254 | arrays.append(array) 255 | 256 | props = Props(output=result, arrays=arrays) 257 | return 
props 258 | 259 | 260 | def contains_array(code: str) -> bool: 261 | for s in ['int*', 'char*', 'float*', 'int *', 'char *', 'float *']: 262 | if code.count(s) > 0: 263 | return True 264 | return False 265 | 266 | def signature2standalone(signature: Signature, function_code: str, props: str, examples) -> Tuple[str,str]: 267 | ##### c_imp 268 | # props is only used if the return type is void, then we need to know which is the "result" of the function (side-effect) 269 | malloc_lib = '#include ' if contains_array(function_code) else '' 270 | c_imp = f"#include \n{malloc_lib}\n" + function_code + '\n' 271 | c_imp += '#include \n#include \n' 272 | c_imp += 'int main() {\n' 273 | parsed_props = parse_props(props) 274 | scanf = get_scanf(signature, parsed_props, func_code=function_code) 275 | # print(scanf) 276 | c_imp += scanf 277 | function_call = get_function_call(signature) 278 | c_imp += ' ' + function_call 279 | printf = get_printf(signature, props=parsed_props) 280 | c_imp += printf 281 | 282 | c_imp += '\n return 0;\n}\n' 283 | 284 | c_imp = c_imp.replace('None', 'n') 285 | 286 | # Force single declaration of idx 287 | before_first_idx, first_idx, after_first_idx = c_imp.partition('int idx;') 288 | c_imp = before_first_idx + first_idx + after_first_idx.replace('int idx;', '') 289 | #print(c_imp) 290 | 291 | def get_function_signature_string(f): 292 | s = '' 293 | for line in f.splitlines(): 294 | if line.startswith('#'): 295 | continue 296 | if len(line.split()) == 0: 297 | continue 298 | else: 299 | s = line.strip().replace('{', '') 300 | break 301 | return s 302 | 303 | 304 | 305 | 306 | #### main_code (external) 307 | # props is only used if the return type is void, then we need to know which is the "result" of the function (side-effect) 308 | malloc_lib = '#include ' if contains_array(function_code) else '' 309 | main_code = f"#include \n{malloc_lib}\n" 310 | main_code += '#include \n#include \n' 311 | main_code += f'extern {get_function_signature_string(function_code)};\n' 312 | main_code += 'int main() {\n' 313 | props = parse_props(props) 314 | scanf = get_scanf(signature, props, func_code=function_code) 315 | 316 | main_code += scanf 317 | function_call = get_function_call(signature) 318 | main_code += ' ' + function_call 319 | printf = get_printf(signature, props=props) 320 | main_code += printf 321 | 322 | main_code += '\n return 0;\n}\n' 323 | 324 | # Force single declaration of idx 325 | before_first_idx, first_idx, after_first_idx = main_code.partition('int idx;') 326 | main_code = before_first_idx + first_idx + after_first_idx.replace('int idx;', '') 327 | 328 | main_code = main_code.replace('None', 'n') 329 | 330 | return c_imp, main_code 331 | 332 | 333 | @contextlib.contextmanager 334 | def stdoutIO(stdout=None): 335 | old = sys.stdout 336 | if stdout is None: 337 | stdout = StringIO() 338 | sys.stdout = stdout 339 | yield stdout 340 | sys.stdout = old 341 | 342 | def run_python_script(name: str, path: str) -> str: 343 | previous_dir = os.getcwd() 344 | os.chdir(path) 345 | 346 | with stdoutIO() as s: 347 | try: 348 | exec(open(name).read()) 349 | except BaseException as e: 350 | pass 351 | os.chdir(previous_dir) 352 | 353 | # 1 - min_so_far_subtracted etc have errors in the L2 files. 354 | # 2- integers must come first to guarantee that array sizes are initialized before arrays. 
however, in the examples these is not respected 355 | # so we reorder 356 | # so, scalars (in order) then arrays (in order) 357 | # assume only input is affected 358 | def get_examples(example_path: str, use_simpl_instead_of_L2: bool, scalars_first: bool, signature: Signature) -> List[Tuple[str, str]]: 359 | 360 | 361 | if use_simpl_instead_of_L2: 362 | with open(os.path.join(example_path, 'simpl'), 'r') as f: 363 | simpl = f.read() 364 | data = simpl2json(simpl, signature=signature) 365 | 366 | simpl_header = [l for l in simpl.split('\n') if "fun" in l][0] 367 | diff_num_parameters = len(simpl_header.replace('fun', '').replace('->','').split()) - len(signature.parameters) 368 | if diff_num_parameters != 0: 369 | # simpl includes length of output 370 | if diff_num_parameters == 1: 371 | import re 372 | for i in range(len((data['contents']['examples']))): 373 | 374 | data['contents']['examples'][i] = re.sub(' -?\d+\)',')', data['contents']['examples'][i]) 375 | else: 376 | 377 | # simpl includes length of output 378 | if signature.return_type == 'void': 379 | import re 380 | for i in range(len((data['contents']['examples']))): 381 | data['contents']['examples'][i] = re.sub(' -?\d+\)', ')', data['contents']['examples'][i]) 382 | # simpl includes length for each array, even if according to the c implementation they are equal 383 | import re 384 | for i in range(len((data['contents']['examples']))): 385 | # remove extra Ns 386 | 387 | c = itertools.count() 388 | data['contents']['examples'][i] = re.sub('\] -?\d+ \[', lambda x: x.group() if not next(c) else '] [', data['contents']['examples'][i]) 389 | data['contents']['examples'][i] = re.sub(r"(\]) (-?\d+) (-?\d+) (-?\d+) (\[)", r"\1 \3 \5", data['contents']['examples'][i]) 390 | 391 | 392 | 393 | else: 394 | with open(os.path.join(example_path, 'L2'), 'r') as f: 395 | data = json.load(f) 396 | 397 | parsed_examples = [] 398 | for example in data['contents']['examples']: 399 | # "(f 1) -> 1", "(f 4) -> 36", "(f 4) -> 36", "(f 1) -> 1" 400 | inp, _, out = example.partition('->') 401 | inp = ' '.join(inp.strip()[2:-1].split()) 402 | out = ' '.join(out.split()) 403 | 404 | def parse(text): 405 | parsed = '' 406 | i = 0 407 | in_array = False 408 | while i < len(text): 409 | current_char = text[i] 410 | if len(current_char.split()) == 0: 411 | if in_array: 412 | parsed += ' ' 413 | else: 414 | parsed += '\n' 415 | elif current_char == '[': 416 | parsed += '\n' 417 | in_array = True 418 | elif current_char == ']': 419 | parsed += '\n' 420 | in_array = False 421 | else: 422 | parsed += current_char 423 | i += 1 424 | return parsed 425 | 426 | def parse_scalars_first(text): 427 | parsed_scalars = '' 428 | parsed_arrays = '' 429 | i = 0 430 | in_array = False 431 | while i < len(text): 432 | current_char = text[i] 433 | if len(current_char.split()) == 0: 434 | if in_array: 435 | parsed_arrays += ' ' 436 | else: 437 | parsed_scalars += '\n' 438 | elif current_char == '[': 439 | parsed_arrays += '\n' 440 | in_array = True 441 | elif current_char == ']': 442 | parsed_arrays += '\n' 443 | in_array = False 444 | else: 445 | if in_array: 446 | parsed_arrays += current_char 447 | else: 448 | parsed_scalars += current_char 449 | i += 1 450 | return parsed_scalars + parsed_arrays 451 | 452 | if scalars_first: 453 | parsed_inp = parse_scalars_first(inp) 454 | else: 455 | parsed_inp = parse(inp) 456 | parsed_out = parse(out) 457 | parsed_examples.append((parsed_inp, parsed_out)) 458 | return parsed_examples 459 | 460 | 461 | 462 | 463 | def 
get_asm_header_footer_body(asm_path: str) -> Tuple[str, str, str]: 464 | with open(asm_path, 'r') as f: 465 | asm = f.readlines() 466 | header = '' 467 | for line in asm: 468 | if ':' in line: 469 | break 470 | header += line 471 | with open(asm_path, 'r') as f: 472 | asm = f.read() 473 | body, _, footer = asm.partition('.cfi_endproc') 474 | body = body[len(header):] 475 | return header, footer, body 476 | 477 | 478 | def simpl2json(simpl: str, signature: Signature) -> Dict: 479 | ''' 480 | 481 | Examples 482 | Examples 483 | {6,0,4,8,7,6,4,7,5,9,3,8,2,4},14,{2,1,9,4,8,9,2,4,1,1,10,5,7,8},14,{0,0,0,0,0,0,0,0,0,0,0,0,0,0},14 -> {-1,0,-8,-3,-7,-8,-1,-3,0,0,-9,-4,-6,-7}; 484 | {5,6,5,9},4,{10,3,8,7},4,{0,0,0,0},4 -> {-9,-2,-7,-6}; 485 | {8,4,0,8,0,1,6,10,10,0,9,7,5,3,5,1},16,{3,9,3,3,2,8,7,1,1,5,8,7,1,4,8,4},16,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},16 -> {-2,-8,-2,-2,-1,-7,-6,0,0,-4,-7,-6,0,-3,-7,-3}; 486 | {8,5,8,3},4,{9,8,9,4},4,{0,0,0,0},4 -> {-8,-7,-8,-3}; 487 | {1,9,6,5,9,3,4,2,3,2,0,9,10,4,7,1},16,{1,10,2,2,0,1,8,10,6,8,4,8,3,3,10,9},16,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},16 -> {0,-9,-1,-1,0,0,-7,-9,-5,-7,-3,-7,-2,-2,-9,-8}; 488 | {9,4,7,7,10,10,5,1,5,9,1,7,9,10},14,{5,3,3,0,4,1,3,5,2,5,6,0,1,2},14,{0,0,0,0,0,0,0,0,0,0,0,0,0,0},14 -> {-4,-2,-2,0,-3,0,-2,-4,-1,-4,-5,0,0,-1}; 489 | {0,9,10,8,9,10,1,0},8,{1,10,3,9,9,1,6,1},8,{0,0,0,0,0,0,0,0},8 -> {0,-9,-2,-8,-8,0,-5,0}; 490 | {1,0,9,0,3,2,1,7,3,0,10,0},12,{8,6,9,1,4,1,3,1,10,4,5,6},12,{0,0,0,0,0,0,0,0,0,0,0,0},12 -> {-7,-5,-8,0,-3,0,-2,0,-9,-3,-4,-5}; 491 | {0,8,7,0,9,1},6,{6,3,4,5,7,9},6,{0,0,0,0,0,0},6 -> {-5,-2,-3,-4,-6,-8}; 492 | 493 | ["(f [] []) -> []", "(f [6 0 4 8 7 6 4 7 5 9 3 8 2 4] [2 1 9 4 8 9 2 4 1 1 10 5 7 8]) -> [-1 0 -8 -3 -7 -8 -1 -3 0 0 -9 -4 -6 -7]", 494 | 495 | ''' 496 | L2_examples = [] 497 | lines = simpl.splitlines()[1:] 498 | simpl_header = [l for l in simpl.split('\n') if "fun" in l][0] 499 | for line in lines: 500 | if len(line.split()) == 0: 501 | break 502 | L2 = line.strip().replace('{', '[').replace('}', ']').replace(',', ' ').replace(';', '') 503 | L2 = '(f ' + L2.replace(' ->', ') ->') 504 | L2_examples.append(L2) 505 | # hack to have n before arrays of size n! 
506 | diff_num_parameters = len(simpl_header.replace('fun', '').replace('->','').split()) - len(signature.parameters) 507 | if diff_num_parameters != 0: 508 | # simpl includes length of output 509 | pass 510 | return {'contents': {'examples': L2_examples}} 511 | 512 | 513 | 514 | def run_io(c_code: str, example_path: str, just_func_code: str, main_code: str, signature: Signature, predictions_path: str, use_simpl_instead_of_L2: bool) -> Tuple[bool, bool, bool, float]: 515 | try: 516 | examples = get_examples(example_path, use_simpl_instead_of_L2=use_simpl_instead_of_L2, scalars_first=True, signature=signature) 517 | except FileNotFoundError as e: 518 | return False, False, False, 0.0 # benchmark ok, model ok, syntax model ok, BLEU model 519 | 520 | func_name = example_path.split(os.sep)[-1] 521 | dir_ = os.path.join(predictions_path, func_name) 522 | 523 | # Run with gcc:sc 524 | from neural_compilers.utils.utilities import get_tmp_file, run_command 525 | tmp_c = get_tmp_file(c_code, extension='.c', dir=dir_) 526 | output = tmp_c[:-2] + '.x' 527 | stdout, stderr = run_command(f'gcc -O0 -x c -o {output} {tmp_c}') 528 | print(stderr) 529 | gcc_corrects = 0 530 | bad_examples_in_benchmark = 0 531 | for idx, (in_, out_) in enumerate(examples): 532 | if os.path.basename(example_path) in BAD_EXAMPLES and idx in BAD_EXAMPLES[os.path.basename(example_path)]: 533 | bad_examples_in_benchmark += 1 534 | continue 535 | # stdout, stderr = run_command(f'./{output} {tmp_c}', stdin= 536 | prefix_ex = './' if not os.path.isabs(output) else '' 537 | try: 538 | stdout, stderr = run_command(f'{prefix_ex}{output}', stdin=in_) 539 | except FileNotFoundError as e: 540 | return False, False, False, 0.0 541 | if stdout.strip() == out_.strip(): 542 | gcc_corrects += 1 543 | 544 | print(example_path, 'GCC: ', f'{gcc_corrects}/{len(examples) - bad_examples_in_benchmark}') 545 | print() 546 | os.remove(tmp_c) 547 | os.remove(output) 548 | 549 | # first compile with gcc to get header and footer 550 | tmp_c = get_tmp_file(just_func_code, extension='.c', dir=dir_) 551 | output = tmp_c[:-2] + '.s' 552 | stdout, stderr = run_command(f'gcc -O0 -c -S -o {output} {tmp_c}') 553 | asm_header, asm_footer, asm_body = get_asm_header_footer_body(output) 554 | func_name = example_path.split(os.sep)[-1] 555 | import glob 556 | max_model_corrects = 0 557 | best_bleu = 0.0 558 | best_syntax = False 559 | for idx, hypothesis in reversed(list(enumerate(sorted(glob.glob(os.path.join(predictions_path, func_name, f'{func_name}*.s')))))): 560 | print('Hypothesis:', idx+1) 561 | print(hypothesis) 562 | hypothesis_ = open(hypothesis).read() 563 | model_assembly = asm_header + hypothesis_ + asm_footer 564 | tmp_s = get_tmp_file(model_assembly, extension='.s', dir=dir_) 565 | output = tmp_c[:-2] + '.x' 566 | main_ = get_tmp_file(main_code, extension='.c', dir=dir_) 567 | stdout, stderr = run_command(f'gcc -O0 -o {output} {main_} {tmp_s}') 568 | 569 | 570 | model_corrects = 0 571 | bad_examples_in_benchmark = 0 572 | not_compiled = False 573 | for idx, (in_, out_) in enumerate(examples): 574 | if os.path.basename(example_path) in BAD_EXAMPLES and idx in BAD_EXAMPLES[os.path.basename(example_path)]: 575 | bad_examples_in_benchmark += 1 576 | continue 577 | try: 578 | prefix_ex = './' if not os.path.isabs(output) else '' 579 | stdout, stderr = run_command(f'{prefix_ex}{output}', stdin=in_, timeout=5) 580 | if stdout.strip() == out_: 581 | model_corrects += 1 582 | except BaseException as e: 583 | if isinstance(e, TimeoutError): 584 | break 585 | 
not_compiled = True 586 | break 587 | ref_tok = ' '.join(code_tokenizer.tokenize(programs=asm_body, lang='asm')) 588 | hyp_tok = hypothesis_.replace('\n', '') 589 | bleu_score = eval_bleu(ref=ref_tok, hyp=hyp_tok) 590 | print('BLEU =', bleu_score) 591 | if bleu_score > best_bleu: 592 | best_bleu = bleu_score 593 | print('SYNTAX', 'INCORRECT' if not_compiled else 'CORRECT') 594 | if not not_compiled: 595 | if not best_syntax: 596 | best_syntax = True 597 | print(example_path, 'IO: ', f'{model_corrects}/{len(examples) - bad_examples_in_benchmark}') 598 | else: 599 | print(example_path, 'IO: N/A (Error:', stderr, ')') 600 | print() 601 | if not not_compiled: 602 | os.remove(tmp_s) 603 | os.remove(output) 604 | 605 | if model_corrects > max_model_corrects: 606 | max_model_corrects = model_corrects 607 | return gcc_corrects == len(examples), max_model_corrects == gcc_corrects and gcc_corrects > 0, best_syntax, best_bleu 608 | 609 | 610 | def run(synthesis_eval_path: str, predictions_path: str): 611 | standaloner_code_works = 0 612 | total = 0 613 | none_in_code = 0 614 | benchmark_oks = 0 615 | model_oks = 0 616 | syntax_oks = 0 617 | bleu = 0.0 618 | for idx, example in enumerate(sorted(os.listdir(os.path.join(synthesis_eval_path, 'examples')))): 619 | example_path = os.path.join(synthesis_eval_path, 'examples', example) 620 | if example.startswith('__') or not os.path.isdir(example_path) or example in BAD_CASES: 621 | continue 622 | total += 1 623 | c_path = os.path.join(example_path, 'ref.c') 624 | parsed_signature, _ = parse_file(c_path) 625 | with open(c_path, 'r') as c: 626 | c_code = c.read() 627 | props_path = os.path.join(example_path, 'props') 628 | with open(props_path, 'r') as p: 629 | props = p.read() 630 | try: 631 | c_imp, main_code = signature2standalone(parsed_signature, c_code, props, examples=get_examples(example_path, use_simpl_instead_of_L2=True, scalars_first=True, signature=parsed_signature)) 632 | if c_imp.count('None') > 0: 633 | none_in_code += 1 634 | gcc = GCC(print_stderr=False) 635 | if len(gcc.compile(c_imp).splitlines()) > 1: 636 | standaloner_code_works += 1 637 | print('-------------------') 638 | 639 | benchmark_ok, model_ok, best_syntax, best_bleu = run_io(c_imp, example_path, just_func_code=c_code, main_code=main_code, signature=parsed_signature, predictions_path=predictions_path, use_simpl_instead_of_L2=True) 640 | if not benchmark_ok: 641 | benchmark_ok, model_ok, best_syntax, best_bleu = run_io(c_imp, example_path, just_func_code=c_code, main_code=main_code, signature=parsed_signature, 642 | predictions_path=predictions_path, use_simpl_instead_of_L2=False) 643 | if benchmark_ok: 644 | benchmark_oks += benchmark_ok 645 | model_oks += model_ok 646 | syntax_oks += best_syntax 647 | bleu += best_bleu 648 | def str_ok(b): 649 | return "OK" if b else "NOT OK" 650 | print('Benchmark OK!') 651 | complexity_ref = lizard.analyze_file(c_path).__dict__['function_list'][0].__dict__ 652 | cyclomatic = complexity_ref['cyclomatic_complexity'] 653 | nloc = complexity_ref['nloc'] 654 | tokens = complexity_ref['token_count'] 655 | params = len(complexity_ref['parameters']) 656 | pointers = complexity_ref['long_name'].count('*') 657 | print(f'{example}: IO = {str_ok(model_ok)} | SYNTAX = {str_ok(best_syntax)} | BLEU = {best_bleu}' 658 | f' | C_NLOC = {nloc} | C_TOKENS = {tokens} | C_CYCLO = {cyclomatic} | PARAMS = {params} | POINTERS = {pointers}') 659 | else: 660 | print('Benchmark NOT OK!') 661 | except (NotImplementedError, FileNotFoundError) as e: 662 | print('Benchmark 
NOT OK!') 663 | 664 | print('\nBenchmark ok:', benchmark_oks, 'of', total) 665 | print('IO ok:', model_oks, 'of', benchmark_oks) 666 | print('Syntax ok:', syntax_oks, 'of', benchmark_oks) 667 | print('Avg BLEU:', bleu/benchmark_oks) 668 | 669 | 670 | if __name__ == '__main__': 671 | import argparse 672 | from neural_compilers.utils.utilities import init_logging 673 | import os 674 | parser = argparse.ArgumentParser('IO Evaluator') 675 | parser.add_argument('--synthesis-eval-path', type=str) 676 | parser.add_argument('--predictions-path', type=str) 677 | args = parser.parse_args() 678 | # Set up logging etc 679 | timestamp = time.strftime("%Y-%m-%d-%H%M") 680 | repo = git.Repo(search_parent_directories=True) 681 | sha = repo.head.object.hexsha 682 | extra_id = uuid.uuid4().hex 683 | name = f'eval-io-legacy-{timestamp}-{sha[:4]}-{extra_id[:4]}' 684 | eval_path = os.path.join(os.path.dirname(args.predictions_path), name) 685 | os.mkdir(eval_path) 686 | init_logging(os.path.join(eval_path, name + '.log')) 687 | print(args) 688 | run(synthesis_eval_path=args.synthesis_eval_path, predictions_path=args.predictions_path) 689 | print(os.path.join(eval_path, name + '.log')) 690 | -------------------------------------------------------------------------------- /evaluate-syntax.py: -------------------------------------------------------------------------------- 1 | from neural_compilers.eval.evaluator import GASSyntaxEvaluator 2 | import argparse 3 | from typing import Dict 4 | from tokenizers import Tokenizer 5 | import os 6 | from typing import Optional 7 | from pathlib import Path 8 | from copy import deepcopy 9 | import json 10 | from neural_compilers.eval.evaluator import save_eval 11 | 12 | SUBWORD_TAG = '##' 13 | 14 | 15 | def evaluate(system_output_path: str, decode_subwords: bool, system_path: Optional[str]=None) -> Dict: 16 | evaluator = GASSyntaxEvaluator() 17 | results = [] 18 | 19 | if os.path.isfile(system_output_path): 20 | 21 | with open(system_output_path, 'r') as f: 22 | for compilation in f.readlines(): 23 | if decode_subwords: 24 | compilation = compilation.replace(f' {SUBWORD_TAG}', '') 25 | results.append(evaluator.asm_is_valid(compilation)) 26 | else: 27 | for file in sorted(Path(system_output_path).rglob('*.c')): 28 | with open(file, 'r') as f: 29 | compilation = f.read() 30 | if decode_subwords: 31 | compilation = compilation.replace(f' {SUBWORD_TAG}', '') 32 | results.append(evaluator.asm_is_valid(compilation)) 33 | 34 | return evaluator.aggregate(results) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description='Evaluate generation') 39 | parser.add_argument('--system-output-path', type=str, help="Path to compilations. C directory or file." 40 | "If it's a file path, then one function per line." 
41 | "Otherwise, one function per file.") 42 | parser.add_argument('--system-path', type=str, default=None, help='Path to system (tokenizer)') 43 | parser.add_argument('--no-decode-subwords', action='store_true') 44 | parser.add_argument('--config-file', type=str, help='Path to JSON file (instead of command line arguments)') 45 | 46 | args = parser.parse_args() 47 | 48 | orig_args = deepcopy(args) 49 | 50 | if args.config_file: 51 | with open(args.config_file, 'r') as f: 52 | config = json.load(f) 53 | args = argparse.Namespace(**config) 54 | else: 55 | config = vars(args) 56 | args.decode_subwords = not args.no_decode_subwords 57 | 58 | results = evaluate(args.system_output_path, args.decode_subwords, args.system_path) 59 | print(results) 60 | 61 | save_eval(system_output_path=config['system_output_path'], eval_script=__file__, config=config, 62 | results=dict(syntax=results)) 63 | -------------------------------------------------------------------------------- /experiments/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jordiae/neural-compilers/e1b109056730f9cf8b234293a5e4f8056b20727c/experiments/.keep -------------------------------------------------------------------------------- /infer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import os 4 | from neural_compilers.utils.config import TokenizerConfig 5 | from neural_compilers.utils.tokenization import CodeTokenizer, SubwordTokenizer 6 | import json 7 | import git 8 | import time 9 | from neural_compilers.utils.utilities import run_command 10 | import uuid 11 | from neural_compilers.utils.utilities import init_logging 12 | from typing import List 13 | from typing import Tuple 14 | import shutil 15 | from neural_compilers.utils.utilities import get_tmp_file 16 | import logging 17 | from copy import deepcopy 18 | 19 | 20 | def tokenize(raw_code: str, subword_tokenizer: SubwordTokenizer, code_tokenizer: CodeTokenizer, 21 | replace_true_false: bool) -> str: 22 | c_code = raw_code 23 | # hack to avoid includes: TODO: review 24 | new_c_code = '' 25 | for line in c_code.splitlines(): 26 | if not line.startswith('#'): 27 | new_c_code += line 28 | if 'stdbool' in raw_code and replace_true_false: 29 | new_c_code = raw_code.replace('true', '1').replace('false', '0') 30 | c_code = new_c_code 31 | pretok_c = ' '.join(code_tokenizer.tokenize(c_code, lang='c')) 32 | # subword tokenization 33 | tok_c = ' '.join(subword_tokenizer.tokenize_program(pretok_c)) 34 | return tok_c 35 | 36 | 37 | def cat_tokenize(c_path: str, model_path: str, eval_path: str, replace_true_false: bool) -> Tuple[str, List[str]]: 38 | with open(os.path.join(model_path, 'gen-data-config.json'), 'r') as f: 39 | tok_config_dict = json.load(f)['tokenizer_config'] 40 | tok_config = TokenizerConfig(**tok_config_dict) 41 | code_tokenizer = CodeTokenizer.from_config(tok_config) 42 | tokenizer = SubwordTokenizer.from_config(tok_config, output_dir='__') 43 | tokenizer.restore(os.path.join(model_path, 'tokenizer.json')) 44 | cat_path = os.path.join(eval_path, 'all.c') 45 | c_files = sorted(Path(c_path).rglob('*.c')) 46 | if os.path.isdir(c_path): 47 | with open(cat_path, 'w') as outfile: 48 | for c_file in c_files: 49 | with open(c_file, 'r') as infile: 50 | outfile.write(tokenize(infile.read(), subword_tokenizer=tokenizer, code_tokenizer=code_tokenizer, 51 | replace_true_false=replace_true_false) 52 | + '\n') 53 | return 
cat_path, list(map(str, c_files)) 54 | else: 55 | with open(cat_path, 'w') as outfile: 56 | with open(c_path, 'r') as infile: 57 | for line in infile.readlines(): 58 | outfile.write(tokenize(line, subword_tokenizer=tokenizer, code_tokenizer=code_tokenizer, 59 | replace_true_false=replace_true_false)) 60 | return cat_path, [c_path] 61 | 62 | 63 | def run_model(data_path: str, model_path: str, beam: int, top_n: int, checkpoint_path: str) -> List[List[str]]: 64 | directions = '-s c -t s' 65 | if not os.path.isabs(checkpoint_path): 66 | checkpoint_path = os.path.join(model_path, checkpoint_path) 67 | command = f'fairseq-interactive {model_path} {directions} --path {checkpoint_path} --beam {beam} --nbest {top_n}' 68 | 69 | with open(data_path, 'r') as f: 70 | c_code = f.read() 71 | stdout, stderr = run_command(command, stdin=c_code) 72 | res = [] 73 | current = [] # potentially multiple hypotheses 74 | for l in stdout.splitlines(): 75 | if l.startswith('H'): 76 | detokenized = l.split('\t')[2].replace(' ##', '') 77 | current.append(detokenized) 78 | elif l.startswith('S') and len(current) > 0: 79 | res.append(current) 80 | current = [] 81 | if len(current) > 0: 82 | res.append(current) 83 | if len(res) == 0: 84 | print('ERROR') 85 | logging.error(stderr) 86 | exit() 87 | return res 88 | 89 | 90 | def get_asm_header_and_footer(c_path: str) -> Tuple[str, str]: 91 | # first compile with gcc to get header and footer 92 | with open(c_path, 'r') as f: 93 | c = f.read() 94 | tmp_asm = get_tmp_file(c, extension='.s') 95 | stdout, stderr = run_command(f'gcc -O0 -c -S -o {tmp_asm} {c_path}') 96 | with open(tmp_asm, 'r') as f: 97 | asm = f.readlines() 98 | header = '' 99 | for line in asm: 100 | if ':' in line: 101 | break 102 | if 'file' in line: 103 | continue 104 | header += line 105 | with open(tmp_asm, 'r') as f: 106 | asm = f.read() 107 | os.remove(tmp_asm) 108 | _, _, footer = asm.partition('.cfi_endproc') 109 | return header, footer 110 | 111 | 112 | if __name__ == '__main__': 113 | import argparse 114 | 115 | parser = argparse.ArgumentParser('Program to prepare reference C code to be compiled by a model.') 116 | parser.add_argument('--c-path', help="C directory or file. If it's a file path, then one function per line." 117 | "Otherwise, one function per file.") 118 | parser.add_argument('--model-path', help='Directory with the model (checkpoint, tokenizer)') 119 | parser.add_argument('--beam', type=int, default=1, help='Beam size') 120 | parser.add_argument('--top-n', type=int, default=1, help='Print top N hypotheses') 121 | parser.add_argument('--add-header-footer', action='store_true', help='Add assembly header and footer') 122 | parser.add_argument('--replace-true-false', action='store_true', help='Replace true/false in C with 1/0') 123 | parser.add_argument('--checkpoint-path', type=str, default='checkpoint_best.pt', help='Checkpoint path relative to' 124 | 'the model path, or absolute.' 
125 | 'Default: checkpoint_best.pt') 126 | parser.add_argument('--config-file', type=str, help='Path to JSON file (instead of command line arguments)') 127 | 128 | args = parser.parse_args() 129 | 130 | orig_args = deepcopy(args) 131 | 132 | if args.config_file: 133 | with open(args.config_file, 'r') as f: 134 | config = json.load(f) 135 | args = argparse.Namespace(**config) 136 | args.config_file = orig_args.config_file 137 | else: 138 | config = vars(args) 139 | 140 | assert config['top_n'] <= config['beam'] 141 | 142 | # Set up logging etc 143 | timestamp = time.strftime("%Y-%m-%d-%H%M") 144 | repo = git.Repo(search_parent_directories=True) 145 | sha = repo.head.object.hexsha 146 | extra_id = uuid.uuid4().hex 147 | name = f'infer-beam{args.beam}-topn{args.top_n}-{timestamp}-{sha[:4]}-{extra_id[:4]}' 148 | eval_path = os.path.join(args.model_path, name) 149 | os.makedirs(eval_path) 150 | with open(os.path.join(eval_path, 'infer-config.json'), 'w') as f: 151 | json.dump(config, f) 152 | init_logging(os.path.join(eval_path, name + '.log')) 153 | print(eval_path) 154 | logging.info(args) 155 | 156 | logging.info('Preparing files...') 157 | 158 | # cat + tokenize (and collect the C files found) 159 | cat_path, c_files = cat_tokenize(c_path=args.c_path, model_path=args.model_path, eval_path=eval_path, 160 | replace_true_false=args.replace_true_false) 161 | 162 | # run fairseq-interactive with the requested decoding options 163 | logging.info('Running the model...') 164 | results = run_model(data_path=cat_path, model_path=args.model_path, beam=args.beam, 165 | top_n=args.top_n, checkpoint_path=args.checkpoint_path) 166 | if os.path.isdir(args.c_path): 167 | examples_path = os.path.join(eval_path, 'examples') 168 | os.makedirs(examples_path) 169 | for c_file, result_list in zip(c_files, results): 170 | # TODO: insert header/footer?
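# Header/footer insertion is handled below when --add-header-footer is passed:
# get_asm_header_and_footer() compiles the reference C with gcc -O0 -S and wraps each
# hypothesis with the directives emitted before the first label (e.g. .text/.globl/.type)
# and everything after .cfi_endproc (e.g. .size/.ident), so each generated function body
# can be assembled on its own.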
171 | function_name = c_file.split(os.path.sep)[-2] 172 | new_path = os.path.join(examples_path, function_name) 173 | os.makedirs(new_path, exist_ok=True) 174 | shutil.copyfile(c_file, os.path.join(new_path, 'ref.c')) 175 | for idx, result in enumerate(result_list): 176 | result = result.replace('', '\n') 177 | if args.add_header_footer: 178 | header, footer = get_asm_header_and_footer(c_file) 179 | result = header + result + footer 180 | with open(os.path.join(new_path, function_name + f'-{idx+1}.s'), 'w')as f: 181 | f.write(result) 182 | else: 183 | with open(args.c_path, 'r') as r: 184 | for orig_line, (idx, result) in zip(r.readlines(), enumerate(results[0])): 185 | 186 | with open(os.path.join(eval_path, os.path.basename(args.c_path).split('.')[0] + f'-{idx+1}.s'), 'a+') as f: 187 | if args.add_header_footer: 188 | tmp = get_tmp_file(os.path.join(eval_path, args.c_path), extension='c') 189 | header, footer = get_asm_header_and_footer(tmp) 190 | os.remove(tmp) 191 | result = header + result + footer 192 | f.write(result + '\n') 193 | -------------------------------------------------------------------------------- /job-gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import time 4 | from typing import Optional 5 | 6 | 7 | def get_template(name: str, system: str, config_path: str, job_type: str, n_gpus: Optional[int] = None) -> str: 8 | if system == 'cluster' and job_type in ['train', 'infer']: 9 | template = f'''#!/bin/bash 10 | #SBATCH --gres=gpu:{n_gpus} 11 | #SBATCH --mem=20G 12 | #SBATCH --output=slurm_logs/{name}_%j.log''' 13 | elif system == 'calcula': 14 | template = f'''#!/bin/bash 15 | #SBATCH -p veu # Partition to submit to 16 | #SBATCH --mem=20G 17 | #SBATCH --output=slurm_logs/{name}_%j.log 18 | source ~/.bashrc 19 | ''' 20 | elif system in ['local', 'workstation']: 21 | template = '' 22 | else: 23 | raise NotImplementedError() 24 | if n_gpus == 0: 25 | template = template.replace('SBATCH --gres=gpu:0\n ', '') 26 | template += '\nsource venv/bin/activate' 27 | if job_type == 'train': 28 | cmd = f'bash {config_path}' 29 | elif job_type == 'data': 30 | cmd = f'python prepare-train-data.py --config-file {config_path}' 31 | elif job_type == 'infer': 32 | cmd = f'python infer.py --config-file {config_path}' 33 | elif job_type == 'eval-io': 34 | cmd = f'python evaluate-io.py --config-file {config_path}' 35 | elif job_type == 'eval-bleu': 36 | cmd = f'python evaluate-bleu.py --config-file {config_path}' 37 | elif job_type == 'eval-syntax': 38 | cmd = f'python evaluate-syntax.py --config-file {config_path}' 39 | else: 40 | raise NotImplementedError() 41 | template += f'\n{cmd}\n' 42 | return template 43 | 44 | 45 | if __name__ == '__main__': 46 | import argparse 47 | parser = argparse.ArgumentParser('Job generator') 48 | parser.add_argument('--system', type=str, choices=['cluster', 'local', 'workstation'], 49 | help='Where the run will be run') 50 | parser.add_argument('--config-path', type=str, help='Path to config') 51 | parser.add_argument('--job-type', type=str, choices=['data', 'train', 'infer', 'eval-io', 'eval-bleu', 52 | 'eval-syntax'], help='Whether it is a data, training,' 53 | 'inference, or evaluation job') 54 | parser.add_argument('--gpus', type=int, default=1) 55 | args = parser.parse_args() 56 | timestamp = time.strftime("%Y-%m-%d-%H%M") 57 | extra_id = uuid.uuid4().hex 58 | stamp = f'{timestamp}-{extra_id[:4]}' 59 | job_name = f'{args.system}-{os.path.basename(args.config_path)}-{stamp}' 60 | 
job_path = os.path.join('jobs', job_name + '.sh') 61 | template = get_template(job_name, args.system, args.config_path, args.job_type, args.gpus) 62 | 63 | with open(job_path, 'w') as f: 64 | f.write(template) 65 | if args.system == 'local': 66 | cmd = 'bash' 67 | else: 68 | cmd = 'sbatch' 69 | cmd += ' ' + job_path 70 | print(cmd) 71 | -------------------------------------------------------------------------------- /jobs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jordiae/neural-compilers/e1b109056730f9cf8b234293a5e4f8056b20727c/jobs/.keep -------------------------------------------------------------------------------- /neural_compilers/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from neural_compilers.utils.utilities import run_command, get_tmp_path 4 | from typing import List 5 | from neural_compilers.utils.constants import NEWLINE_ESCAPE 6 | from typing import Dict 7 | import git 8 | import time 9 | import uuid 10 | import json 11 | 12 | 13 | def save_eval(system_output_path: str, eval_script: str, config: Dict, results: Dict): 14 | # Set up logging etc 15 | timestamp = time.strftime("%Y-%m-%d-%H%M") 16 | repo = git.Repo(search_parent_directories=True) 17 | sha = repo.head.object.hexsha 18 | extra_id = uuid.uuid4().hex 19 | name = f'{eval_script}-{timestamp}-{sha[:4]}-{extra_id[:4]}' 20 | if os.path.isfile(system_output_path): 21 | system_output_path = system_output_path + '_' + name 22 | eval_path = os.path.join(system_output_path, name) 23 | os.makedirs(eval_path) 24 | with open(os.path.join(eval_path, 'eval-config.json'), 'w') as f: 25 | json.dump(config, f) 26 | with open(os.path.join(eval_path, 'results.json'), 'w') as f: 27 | json.dump(results, f) 28 | 29 | 30 | class Evaluator: 31 | def asm_is_valid(self, asm: str) -> bool: 32 | raise NotImplementedError 33 | 34 | 35 | class GASSyntaxEvaluator: 36 | def asm_is_valid(self, asm: str) -> bool: 37 | asm = asm.replace(NEWLINE_ESCAPE, '\n') 38 | # make gcc read from stdin and write to stdout 39 | # print(asm) 40 | # Note: We CANNOT print the assembler output to stdout. 
See: 41 | # https://stackoverflow.com/questions/47181017/why-cant-i-pipe-assembler-output-to-stdout 42 | # stdout, stderr = run_command(f'gcc -c -x assembler -o /dev/stdout -', stdin=asm) 43 | tmp_path = get_tmp_path() 44 | stdout, stderr = run_command(f'gcc -c -x assembler -o {tmp_path} -', stdin=asm) 45 | if os.path.exists(tmp_path): # compiled 46 | os.remove(tmp_path) 47 | return True 48 | # Print wrong assembly (for error analysis) 49 | print('-------------') 50 | print('INVALID ASSEMBLY!') 51 | print('ERROR:') 52 | print(stderr) 53 | print('ASSEMBLY:') 54 | print() 55 | print(asm) 56 | print('-------------') 57 | print() 58 | return False 59 | 60 | 61 | def aggregate(self, corrects: List[bool]) -> Dict: 62 | return {'syntactic_accuracy': sum(corrects)/len(corrects), 'valid_compilations': sum(corrects), 63 | 'total': len(corrects)} 64 | 65 | -------------------------------------------------------------------------------- /neural_compilers/train/data_generator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import git 4 | import ntpath 5 | import time 6 | import datetime 7 | import uuid 8 | from dirhash import dirhash 9 | from dataclasses import asdict 10 | from typing import Dict 11 | from pathlib import Path 12 | import os 13 | from neural_compilers.utils.config import DataGenConfig 14 | from neural_compilers.utils.tokenization import GCC, CodeTokenizer, SubwordTokenizer, Compiler 15 | from neural_compilers.utils.utilities import run_command, timeit 16 | from typing import Optional 17 | from tqdm import tqdm 18 | import shutil 19 | import multiprocessing as mp 20 | 21 | JOBS = 8 22 | 23 | 24 | class DataGen: 25 | def __init__(self, config: DataGenConfig): 26 | self._config = config 27 | self._code_tokenizer: Optional[CodeTokenizer] = None 28 | self._subword_tokenizer: Optional[SubwordTokenizer] = None 29 | self._compiler: Optional[Compiler] = None 30 | self._output_dir = None 31 | self.is_setup = False 32 | 33 | def setup(self): 34 | assert not self.is_setup 35 | timestamp = time.strftime("%Y-%m-%d-%H%M") 36 | output_path = self._config.output_path 37 | input_name = ntpath.basename(self._config.input_path) 38 | repo = git.Repo(search_parent_directories=True) 39 | sha = repo.head.object.hexsha 40 | extra_id = uuid.uuid4().hex 41 | input_hash = dirhash(self._config.input_path, 'md5', jobs=JOBS) 42 | supervised = 'supervised' if self._config.supervised else 'unsupervised' 43 | output_dir = os.path.join(output_path, 44 | f'{input_name}-{supervised}-{timestamp}-{input_hash}-{sha[:4]}-' 45 | f'{extra_id[:4]}') 46 | self._output_dir = output_dir 47 | os.makedirs(output_dir) 48 | logging.basicConfig(filename=os.path.join(output_dir, 'gen-data.log'), level=logging.INFO) 49 | logging.getLogger('').addHandler(logging.StreamHandler()) 50 | logging.info('hey') 51 | with open(os.path.join(output_dir, 'gen-data-config.json'), 'w') as f: 52 | json.dump(asdict(self._config), f, indent=4) 53 | 54 | self._code_tokenizer = CodeTokenizer.from_config(self._config.tokenizer_config) 55 | self._subword_tokenizer = SubwordTokenizer.from_config(self._config.tokenizer_config, 56 | output_dir=self._output_dir) 57 | self._compiler = GCC() 58 | self.is_setup = True 59 | 60 | def _compile(self, raw_c: str) -> str: 61 | return self._compiler.compile(raw_c) 62 | 63 | def generate(self): 64 | t0 = datetime.datetime.now().timestamp() 65 | self.setup() 66 | stats = self._process_dataset(output_dir=self._output_dir) 67 | with 
open(os.path.join(self._output_dir, 'stats.json'), 'w') as f: 68 | json.dump(stats, f, indent=4) 69 | t1 = datetime.datetime.now().timestamp() 70 | logging.info(f'Total: Elapsed {t1 - t0}s') 71 | 72 | def _process_file(self, path): 73 | # Read C, compile to assembly, discard (if required), and collect stats 74 | with open(path, 'r') as c_file: 75 | raw_c = c_file.read() 76 | raw_asm = self._compile(raw_c) # self._compile(path, output_dir=output_dir) 77 | if len((raw_asm.split())) == 0: 78 | print('WARNING: Skipping empty assembly (cmpile error?)') 79 | return None 80 | # Pretokenize 81 | pretok_c = self._code_tokenizer.tokenize(raw_c, lang='c', just_func=self._config.just_func) 82 | pretok_asm = self._code_tokenizer.tokenize(raw_asm, lang='asm', just_func=self._config.just_func) 83 | total_len = len(pretok_c) + len(pretok_asm) 84 | # Note that we are discarding by length BEFORE subword tokenization. 85 | # It is neither correct or incorrect, but a rather arbitrary design choice 86 | if len(pretok_asm) < self._config.min_tokens // 2: 87 | return None 88 | elif total_len < self._config.min_tokens: 89 | return None 90 | elif total_len > self._config.max_tokens: 91 | return None 92 | else: 93 | c_path = os.path.join(self._output_dir, path) 94 | with open(c_path, 'w') as f: 95 | f.write(' '.join(pretok_c) + '\n') 96 | asm_path = c_path[:-1] + 's' 97 | with open(asm_path, 'w') as f: 98 | f.write(' '.join(pretok_asm) + '\n') 99 | return c_path 100 | 101 | def _process_dataset(self, output_dir: str) -> Dict: 102 | stats = {'c_length': 0, 'asm_length': 0, 'discarded_min': 0, 'discarded_max': 0, 'kept': 0} 103 | kept_files = [] 104 | c_corpus_path = os.path.join(output_dir, 'corpus.c') 105 | c_corpus_subword = os.path.join(output_dir, 'corpus.tok.c') 106 | asm_corpus_path = os.path.join(output_dir, 'corpus.s') 107 | asm_corpus_subword = os.path.join(output_dir, 'corpus.tok.s') 108 | 109 | 110 | def ig_f(dir, files): 111 | return [f for f in files if os.path.isfile(os.path.join(dir, f))] 112 | 113 | mirrored_dir = os.path.join(self._output_dir, os.path.basename(self._config.input_path)) 114 | shutil.copytree(self._config.input_path, mirrored_dir, ignore=ig_f) 115 | 116 | 117 | @timeit 118 | def read_tokenize_discard(): 119 | 120 | tasks = list(Path(self._config.input_path).rglob('*.c'))#[:10000] 121 | with mp.Pool() as p: 122 | for _ in tqdm(p.imap_unordered(self._process_file, tasks), total=len(tasks)): 123 | pass 124 | 125 | read_tokenize_discard() 126 | 127 | # cat 128 | # TODO: here we cannot use run_command because we are piping (fix) 129 | command = "find " + mirrored_dir + " -iname '*.c' -print0 | sort -zn | xargs -0 -I '{}' cat '{}' - > " + os.path.join(self._output_dir, 'corpus.c') 130 | os.system(command) 131 | print(command) 132 | command = "find " + mirrored_dir + " -iname '*.s' -print0 | sort -zn | xargs -0 -I '{}' cat '{}' - > " + os.path.join( 133 | self._output_dir, 'corpus.s') 134 | os.system(command) 135 | print(command) 136 | 137 | 138 | # Train-valid-test split 139 | 140 | @timeit 141 | def train_valid_test_split(): 142 | # np.random.shuffle(kept_files) 143 | os.system(f"shuf {os.path.join(self._output_dir, 'corpus.c')} -o {os.path.join(self._output_dir, 'corpus.shuf.c')} --random-source={os.path.join(self._output_dir, 'corpus.c')}") 144 | os.system(f"shuf {os.path.join(self._output_dir, 'corpus.s')} -o {os.path.join(self._output_dir, 'corpus.shuf.s')} --random-source={os.path.join(self._output_dir, 'corpus.c')}") 145 | with open(os.path.join(self._output_dir, 'corpus.shuf.c'), 
'r') as c, open(os.path.join(self._output_dir, 'corpus.shuf.s'), 'r') as s: 146 | lines = c.readlines() 147 | with open(os.path.join(self._output_dir, 'test.c'), 'w') as t: 148 | t.writelines(lines[:self._config.valid_test_size]) 149 | with open(os.path.join(self._output_dir, 'valid.c'), 'w') as t: 150 | t.writelines(lines[self._config.valid_test_size:self._config.valid_test_size*2]) 151 | with open(os.path.join(self._output_dir, 'train.c'), 'w') as t: 152 | if self._config.max_train_data: 153 | train_size = len(lines) - self._config.valid_test_size * 2 154 | new_train_size = self._config.max_train_data 155 | assert train_size >= new_train_size 156 | diff = train_size - new_train_size 157 | max_idx = len(lines) - diff 158 | t.writelines(lines[self._config.valid_test_size * 2:max_idx]) 159 | else: 160 | t.writelines(lines[self._config.valid_test_size*2:]) 161 | 162 | lines = s.readlines() 163 | with open(os.path.join(self._output_dir, 'test.s'), 'w') as t: 164 | t.writelines(lines[:self._config.valid_test_size]) 165 | with open(os.path.join(self._output_dir, 'valid.s'), 'w') as t: 166 | t.writelines(lines[self._config.valid_test_size:self._config.valid_test_size * 2]) 167 | with open(os.path.join(self._output_dir, 'train.s'), 'w') as t: 168 | if self._config.max_train_data: 169 | t.writelines(lines[self._config.valid_test_size * 2:max_idx]) 170 | else: 171 | t.writelines(lines[self._config.valid_test_size * 2:]) 172 | 173 | 174 | train_valid_test_split() 175 | 176 | 177 | # Learn subword tokenizer 178 | # joint 179 | self._subword_tokenizer.train([os.path.join(self._output_dir, 'train.c'), os.path.join(self._output_dir, 'train.s')]) 180 | 181 | 182 | # Apply subword tokenizer + write tokenized (text files à la Fairseq) 183 | @timeit 184 | def apply_tokenizer(): 185 | for subset in ['train.c', 'valid.c', 'test.c', 'train.s', 'valid.s', 'test.s']: 186 | path = os.path.join(self._output_dir, subset) 187 | tok_path = os.path.join(self._output_dir, subset.split('.')[0] + '.tok.' 
+ subset.split('.')[1]) 188 | with open(tok_path, 'w') as tok_file: 189 | with open(path, 'r') as pretok_file: 190 | for line in pretok_file: 191 | tokenized = ' '.join(self._subword_tokenizer.tokenize_program(line)) 192 | tok_file.write(tokenized + '\n') 193 | 194 | apply_tokenizer() 195 | 196 | # Fairseq preprocessing 197 | src_lang = 'c' 198 | tgt_lang = 's' 199 | data_path = self._output_dir 200 | train_prefix = os.path.join(data_path, 'train.tok') 201 | valid_prefix = os.path.join(data_path, 'valid.tok') 202 | test_prefix = os.path.join(data_path, 'test.tok') 203 | destdir = data_path 204 | threshold_vocab = 0 205 | workers = JOBS 206 | dict_options = '--joined-dictionary' 207 | fairseq_preprocess_comand = f''' 208 | fairseq-preprocess \ 209 | --source-lang {src_lang} --target-lang {tgt_lang} \ 210 | --trainpref {train_prefix} --validpref {valid_prefix} --testpref {test_prefix} \ 211 | --destdir {destdir} --thresholdtgt {threshold_vocab} --thresholdsrc {threshold_vocab} {dict_options} \ 212 | --workers {workers} 213 | ''' 214 | print(fairseq_preprocess_comand) 215 | stdout, stderr = run_command(fairseq_preprocess_comand) 216 | print(stdout) 217 | print(stderr) 218 | 219 | return stats 220 | -------------------------------------------------------------------------------- /neural_compilers/utils/config.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | 6 | class TokenizerType(str, Enum): 7 | PYGMENTS = 'pygments' 8 | 9 | 10 | class SubwordType(str, Enum): 11 | SUBWORD_NMT = 'subword-nmt' 12 | 13 | 14 | @dataclass 15 | class TokenizerConfig: 16 | tokenizer_type: TokenizerType 17 | subword_tokenizer: SubwordType 18 | subword_vocab_size: int 19 | shared_vocab: bool 20 | 21 | 22 | @dataclass 23 | class DataGenConfig: 24 | input_path: str 25 | output_path: str 26 | min_tokens: int 27 | max_tokens: int 28 | supervised: bool 29 | valid_test_size: int 30 | seed: int 31 | tokenizer_config: TokenizerConfig 32 | just_func: bool = False 33 | config_path: Optional[str] = None 34 | max_train_data: Optional[int] = None 35 | 36 | @classmethod 37 | def from_dict(cls, d): 38 | res = cls(**d) 39 | res.tokenizer_config = TokenizerConfig(**d['tokenizer_config']) 40 | return res 41 | -------------------------------------------------------------------------------- /neural_compilers/utils/constants.py: -------------------------------------------------------------------------------- 1 | NEWLINE_ESCAPE = '' 2 | -------------------------------------------------------------------------------- /neural_compilers/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | # https://discuss.huggingface.co/t/how-to-add-additional-custom-pre-tokenization-processing/1637 2 | # https://github.com/huggingface/tokenizers/blob/b24a2fc1781d5da4e6ebcd3ecb5b91edffc0a05f/bindings/python/examples/custom_components.py 3 | 4 | from tokenizers import Tokenizer 5 | from tokenizers.models import BPE 6 | from tokenizers.trainers import BpeTrainer 7 | from pygments import lexers 8 | import re 9 | from typing import List, Union 10 | from neural_compilers.utils.utilities import run_command 11 | import multiprocessing as mp 12 | from neural_compilers.utils.config import SubwordType, TokenizerConfig, TokenizerType 13 | import logging 14 | from neural_compilers.utils.utilities import timeit 15 | from neural_compilers.utils.constants import NEWLINE_ESCAPE 16 | 
from pygments.token import Token 17 | from typing import Optional 18 | from tokenizers.pre_tokenizers import WhitespaceSplit 19 | 20 | 21 | class Compiler: 22 | def compile(self, c_text: str) -> str: 23 | raise NotImplementedError 24 | 25 | def preprocess(self, c_text: str) -> str: 26 | raise NotImplementedError 27 | 28 | 29 | class GCC(Compiler): 30 | def __init__(self, print_stderr: bool = False): 31 | super().__init__() 32 | self._print_stderr = print_stderr 33 | 34 | def compile(self, c_text: str) -> str: 35 | # make gcc read from stdin and write to stdout 36 | stdout, stderr = run_command(f'gcc -S -O0 -x c -o /dev/stdout -', stdin=c_text) 37 | if self._print_stderr and stderr: 38 | print(stderr) 39 | return stdout 40 | 41 | def preprocess(self, c_text: str) -> str: 42 | # make gcc read from stdin and write to stdout 43 | stdout, stderr = run_command(f'gcc -E -x c -o /dev/stdout -', stdin=c_text) 44 | if stderr: 45 | pass # logging.warning(stderr) 46 | return stdout 47 | 48 | 49 | class CodeTokenizer: 50 | @classmethod 51 | def from_config(cls, config: TokenizerConfig): 52 | if config.tokenizer_type == TokenizerType.PYGMENTS: 53 | return PygmentsTokenizer() 54 | raise NotImplementedError(config.tokenizer_type) 55 | 56 | def _tokenize_c(self, program: str, just_func: bool = False) -> List[str]: 57 | raise NotImplementedError 58 | 59 | def _tokenize_asm(self, program: str, just_func: bool = False) -> List[str]: 60 | raise NotImplementedError 61 | 62 | def tokenize(self, programs: Union[str, List[str]], lang: str, 63 | par: bool = False, just_func: bool = False) -> Union[List[str], List[List[str]]]: 64 | is_str = False 65 | if isinstance(programs, str): 66 | is_str = True 67 | programs = [programs] 68 | if lang == 'c': 69 | tokenize_func = self._tokenize_c 70 | elif lang == 'asm': 71 | tokenize_func = self._tokenize_asm 72 | else: 73 | raise ValueError(lang) 74 | if par: 75 | if just_func: 76 | raise NotImplementedError 77 | with mp.Pool() as pool: 78 | tokenized = pool.map(tokenize_func, programs) 79 | else: 80 | tokenized = list(map(lambda x: tokenize_func(x, just_func=just_func), programs)) 81 | if is_str: 82 | tokenized = tokenized[0] 83 | return tokenized 84 | 85 | 86 | class PygmentsTokenizer(CodeTokenizer): 87 | def __init__(self): 88 | super().__init__() 89 | self._c = lexers.get_lexer_by_name('c') 90 | self._asm = lexers.get_lexer_by_name('gas') 91 | self._c_compiler = GCC() 92 | 93 | def _tokenize_c(self, program: str, just_func: bool = False) -> List[str]: 94 | program = self._c_compiler.preprocess(program) 95 | 96 | # keep only the function, if required 97 | def is_header_of_implemented_func(s): 98 | return '(' in s and ';' not in s and 'while' not in s and 'if' not in s 99 | 100 | if just_func: 101 | lines = [] 102 | inside_func = False 103 | for line in program.splitlines(): 104 | if not inside_func and is_header_of_implemented_func(line): 105 | inside_func = True 106 | if inside_func: 107 | lines.append(line) 108 | program = '\n'.join(lines) 109 | 110 | tokenized = [token.strip() for token_type, token in self._c.get_tokens(program) if len(token.split()) > 0 and 111 | token_type not in Token.Comment] 112 | 113 | return tokenized 114 | 115 | def _tokenize_asm(self, program: str, just_func: bool = False) -> List[str]: 116 | filtered_program = [] 117 | # Filter metadata 118 | inside_implemented_func = False 119 | for line in program.splitlines(): 120 | if line.strip().startswith('.file'): # file name (metadata) 121 | continue 122 | if line.strip().startswith('.ident'): # 
compiler/OS version (metadata): 123 | continue 124 | if not just_func: 125 | filtered_program.append(line) 126 | else: # logic for just keeping the procedure 127 | if just_func and inside_implemented_func: 128 | if '.cfi_endproc' in line: 129 | filtered_program.append(line) 130 | break 131 | 132 | 133 | pattern = re.compile("(_[A-Z]|[a-z])\w:") 134 | if not inside_implemented_func and pattern.search(line): 135 | inside_implemented_func = True 136 | if inside_implemented_func: 137 | filtered_program.append(line) 138 | 139 | filtered_program = '\n'.join(filtered_program) 140 | tokenized =\ 141 | [token for token_type, token in self._asm.get_tokens(filtered_program) if token_type not in Token.Comment] 142 | # newline needed (end of statement) 143 | tokenized = [NEWLINE_ESCAPE if token == '\n' else token for token in tokenized] 144 | tokenized = (' '.join(tokenized)).split() 145 | return tokenized 146 | 147 | 148 | class SubwordTokenizer: 149 | def __init__(self, subword_tokenizer_type: SubwordType, subword_vocab_size: int, shared_vocab: bool, 150 | output_dir: str): 151 | if not shared_vocab: 152 | raise NotImplementedError('Not sharing vocab') 153 | self._vocab_size = subword_vocab_size 154 | self.type = subword_tokenizer_type 155 | self.trained = False 156 | self.hf_tokenizer: Optional[Tokenizer] = None 157 | self.output_dir = output_dir 158 | 159 | @classmethod 160 | def from_config(cls, config: TokenizerConfig, output_dir: str): 161 | return cls(config.subword_tokenizer, config.subword_vocab_size, config.shared_vocab, output_dir=output_dir) 162 | 163 | def restore(self, tok_path: str): 164 | assert not self.trained 165 | self.hf_tokenizer = Tokenizer.from_file(tok_path) 166 | self.trained = True 167 | 168 | @timeit 169 | def train(self, files: List[str]): 170 | assert not self.trained 171 | if self.type != 'subword-nmt': 172 | raise NotImplementedError(self.type) 173 | 174 | special_tokens = [ 175 | "", 176 | "", 177 | "", 178 | "", 179 | "" # assembly 180 | ] 181 | import string 182 | ascii_alphabet = string.printable 183 | self.hf_tokenizer = Tokenizer(BPE()) 184 | # Needed to restore (HF bug): https://github.com/huggingface/tokenizers/issues/566 185 | self.hf_tokenizer.pre_tokenizer = WhitespaceSplit() 186 | trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=self._vocab_size, show_progress=True, 187 | initial_alphabet=ascii_alphabet, min_frequency=0, continuing_subword_prefix='##') 188 | self.hf_tokenizer.train(files=files, trainer=trainer) 189 | self.hf_tokenizer.enable_truncation(max_length=512) 190 | logging.info(f"saving model tokenizer to {self.output_dir}") 191 | import os 192 | self.hf_tokenizer.save(os.path.join(self.output_dir, 'tokenizer.json')) 193 | self.trained = True 194 | 195 | def tokenize_program(self, program: str) -> List[str]: 196 | line = self.hf_tokenizer.encode(program, add_special_tokens=False).tokens # no special tokens -> fairseq 197 | return line 198 | 199 | def tokenize_programs(self, programs: List[str]) -> List[List[str]]: 200 | tokenized = [] 201 | for program in programs: 202 | tokenized.append(self.tokenize_program(program)) 203 | return tokenized 204 | 205 | -------------------------------------------------------------------------------- /neural_compilers/utils/utilities.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import Tuple 3 | import subprocess 4 | import os 5 | import numpy as np 6 | import random 7 | import logging 8 | import time 9 | from typing import Callable, 
Any, Optional 10 | import os 11 | 12 | 13 | def get_tmp_file(content: str, extension: str = '', dir: str ='') -> str: 14 | filename = os.path.join(dir, uuid.uuid4().hex + extension) 15 | with open(filename, 'w') as f: 16 | f.write(content) 17 | return filename 18 | 19 | 20 | def get_tmp_path() -> str: 21 | filename = uuid.uuid4().hex 22 | return filename 23 | 24 | 25 | def run_command(command: str, stdin: Optional[str] = None, timeout: Optional[int] = None) -> Tuple[str, str]: 26 | output = subprocess.run(command.split(), capture_output=True, text=True, input=stdin, timeout=timeout) 27 | return output.stdout, output.stderr 28 | 29 | 30 | def deterministic(seed: int): 31 | os.environ['PYTHONHASHSEED'] = str(seed) 32 | np.random.seed(seed) # Numpy module. 33 | random.seed(seed) # Python random module. 34 | 35 | 36 | def timeit(func: Callable) -> Any: 37 | def wrapped(*args, **kwargs): 38 | func_name = func.__name__ 39 | logging.info(f'Running {func_name}') 40 | t0 = time.time() 41 | res = func(*args, **kwargs) 42 | t1 = time.time() 43 | logging.info(f'Run {func_name} in {t1-t0}s') 44 | return res 45 | return wrapped 46 | 47 | 48 | def init_logging(name: str): 49 | logging.basicConfig(filename=name, level=logging.INFO) 50 | logging.getLogger('').addHandler(logging.StreamHandler()) 51 | -------------------------------------------------------------------------------- /prepare-train-data.py: -------------------------------------------------------------------------------- 1 | from neural_compilers.utils.config import DataGenConfig, TokenizerConfig 2 | from neural_compilers.train.data_generator import DataGen 3 | import json 4 | 5 | 6 | def main(): 7 | import argparse 8 | parser = argparse.ArgumentParser('Train data generator') 9 | parser.add_argument('--config-file', help='Path to config file') 10 | args = parser.parse_args() 11 | with open(args.config_file, 'r') as f: 12 | config = json.load(f) 13 | config = DataGenConfig.from_dict(config) 14 | config.config_path = args.config_file 15 | data_gen = DataGen(config) 16 | data_gen.generate() 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.13.0 2 | antlr4-python3-runtime==4.8 3 | attrs==20.3.0 4 | cachetools==4.2.2 5 | certifi==2021.5.30 6 | cffi==1.14.5 7 | chardet==4.0.0 8 | click==7.1.2 9 | Cython==0.29.22 10 | dataclasses==0.6 11 | dirhash==0.2.1 12 | fairseq==0.10.2 13 | gitdb==4.0.6 14 | GitPython==3.1.14 15 | google-auth==1.32.0 16 | google-auth-oauthlib==0.4.4 17 | grpcio==1.38.1 18 | hydra-core==1.0.6 19 | idna==2.10 20 | importlib-metadata==4.5.0 21 | importlib-resources==5.1.2 22 | jieba==0.42.1 23 | joblib==1.0.1 24 | Markdown==3.3.4 25 | numpy==1.20.1 26 | oauthlib==3.1.1 27 | omegaconf==2.0.6 28 | pandas==1.2.3 29 | pathspec==0.8.1 30 | portalocker==2.0.0 31 | protobuf==3.17.3 32 | pyasn1==0.4.8 33 | pyasn1-modules==0.2.8 34 | pycparser==2.20 35 | Pygments==2.8.1 36 | python-dateutil==2.8.1 37 | pytz==2021.1 38 | PyYAML==5.4.1 39 | regex==2021.3.17 40 | requests==2.25.1 41 | requests-oauthlib==1.3.0 42 | rsa==4.7.2 43 | sacrebleu==1.5.1 44 | sacremoses==0.0.45 45 | scantree==0.0.1 46 | six==1.15.0 47 | smmap==3.0.5 48 | tensorboard==2.5.0 49 | tensorboard-data-server==0.6.1 50 | tensorboard-plugin-wit==1.8.0 51 | tokenizers==0.10.2 52 | torch==1.8.1 53 | tqdm==4.59.0 54 | typing-extensions==3.7.4.3 55 | urllib3==1.26.5 56 | 
Werkzeug==2.0.1 57 | zipp==3.4.1 58 | -------------------------------------------------------------------------------- /scripts/corr.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from astropy.table import Table 4 | import re 5 | from scipy.stats import pearsonr 6 | 7 | DATA = 'output/table.tex' 8 | 9 | if __name__ == '__main__': 10 | with open(DATA, 'r') as f: 11 | content = f.read() 12 | content = content.lower().replace('\\cmark', '1').replace('\\xmark', '0') 13 | content = re.sub(r'(\\textsc{)(\w+)(})', '\\2', content) 14 | with open(DATA + '.tmp.tex', 'w') as f: 15 | f.write(content) 16 | df = Table.read(DATA + '.tmp.tex').to_pandas() 17 | print(df[df.columns[1:]].corr()['io'][:]) 18 | 19 | import numpy as np 20 | 21 | rho = df[df.columns[1:]].corr() 22 | pval = df[df.columns[1:]].corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape) 23 | p = pval.applymap(lambda x: ''.join(['*' for t in [0.01, 0.05, 0.1] if x <= t])) 24 | p = pval.applymap(lambda x: ('*' if x <= 0.05 else '') + f' ({x:.2E})') 25 | print((rho.round(3).astype(str) + p)['io']) 26 | -------------------------------------------------------------------------------- /scripts/create-infer-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | for CONFIG in infer-config-16k-ch5.json infer-config-250k-ch5.json infer-config-4k-ch5.json infer-config-base-ch10.json infer-config-base-ch5.json infer-config-big-ch5.json infer-config-small-ch5.json infer-config-wd-ch5.json 4 | do 5 | python job-gen.py --system calcula --config-path configs/infer_configs/$CONFIG --job-type infer 6 | done -------------------------------------------------------------------------------- /scripts/extract-gen-eval-syntax.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: bash scripts/extract-gen-eval-syntax.sh PATH_TO_FAIRSEQ_GENERATE_LOG 4 | 5 | NAME=$1 6 | 7 | grep "^H" $NAME | awk 'BEGIN {FS="\t"}; {print $3}' | sed -r 's/(##)//g' > $NAME.detok 8 | 9 | venv8/bin/python evaluate-syntax.py --system-output-path $NAME.detok 10 | -------------------------------------------------------------------------------- /scripts/extract-gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME=gen_3958 4 | 5 | grep "^H" $NAME.log | awk 'BEGIN {FS="\t"}; {print $3}' | sed -r 's/(##)//g' > $NAME.detok.s 6 | -------------------------------------------------------------------------------- /scripts/format-results-each.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-3f2d/eval-io-legacy-2021-08-05-1430-949b-6b75/eval-io-legacy-2021-08-05-1430-949b-6b75.log" 4 | grep $NAME -e "OK |" | sed 's/INFO:root://g' 5 | -------------------------------------------------------------------------------- /scripts/freq-errors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grep -A1 experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-3f2d/eval-io-legacy-2021-08-03-0138-5a1d-b5c0/eval-io-legacy-2021-08-03-0138-5a1d-b5c0.log -e "IO: N/A " | awk 
-F"Error: " '{print $2}' | sed '/experiments/d' | sort| uniq -c | sort -k1 -n -r 4 | -------------------------------------------------------------------------------- /scripts/gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cwd=$(pwd) 4 | for example in synthesis-eval/examples/* 5 | do 6 | cd $example && python3 gen.py 7 | cd $cwd 8 | done 9 | -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-16k-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_16k_chkpt5_%j.log 6 | 7 | DATA_PATH="experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad" 8 | CHECKPOINTS_PATH="experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad/model_16k/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-250k-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_250k_chkpt5_%j.log 6 | 7 | DATA_PATH="output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294" 8 | CHECKPOINTS_PATH="output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294/model250k/checkpoints/checkpoint5.pt" 9 | 10 | 11 | source ~/.bashrc 12 | source venv/bin/activate 13 | 14 | fairseq-generate $DATA_PATH \ 15 | -s c -t s --path $CHECKPOINTS_PATH \ 16 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-4k-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_4k_chkpt5_%j.log 6 | 7 | DATA_PATH="experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039" 8 | CHECKPOINTS_PATH="experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039/model_4k/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-base-chpt10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_base_chkpt10_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/baseline4-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint10.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate 
$DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-base-chpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_base_chkpt5_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/baseline4-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-big-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_big_chkpt5_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/big-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-med-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_med_chkpt5_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/med-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-small-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_small_chkpt5_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/small-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval-wd-chkpt5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH 
--mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=slurm_logs/gen_eval_wd_chkpt5_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f" 8 | CHECKPOINTS_PATH="runs/wd0.01-AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/checkpoints/checkpoint5.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 16 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm-eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=logs/gen_eval_%j.log 6 | 7 | DATA_PATH="data/AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045" 8 | CHECKPOINTS_PATH="runs/baseline-AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045/checkpoints/checkpoint_best.pt" 9 | 10 | source ~/.bashrc 11 | source venv/bin/activate 12 | 13 | fairseq-generate $DATA_PATH \ 14 | -s c -t s --path $CHECKPOINTS_PATH \ 15 | --batch-size 128 --beam 5 --remove-bpe=' ##' -------------------------------------------------------------------------------- /scripts/generate-slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=logs/gen_%j.log 6 | 7 | source venv/bin/activate 8 | 9 | cat data/AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045/test.tok.c | fairseq-interactive data/AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045 -s c -t s --path runs/baseline-AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045/checkpoints/checkpoint_best.pt -------------------------------------------------------------------------------- /scripts/get-anghabench-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | git clone https://github.com/brenocfg/AnghaBench 4 | -------------------------------------------------------------------------------- /scripts/get-synthesis-benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | git clone https://github.com/mob-group/synthesis-eval 4 | -------------------------------------------------------------------------------- /scripts/intersections.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | 5 | BEST_MODEL="experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-3f2d/eval-io-legacy-2021-08-03-0138-5a1d-b5c0/eval-io-legacy-2021-08-03-0138-5a1d-b5c0.log" 6 | 7 | ALL="experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-6b96/eval-io-legacy-2021-08-03-0121-5a1d-f263/eval-io-legacy-2021-08-03-0121-5a1d-f263.log 8 | 
experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-6463/eval-io-legacy-2021-08-03-0117-5a1d-675e/eval-io-legacy-2021-08-03-0117-5a1d-675e.log 9 | data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-07ce/eval-io-legacy-2021-08-03-0111-5a1d-1a20/eval-io-legacy-2021-08-03-0111-5a1d-1a20.log 10 | experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-3a2d/eval-io-legacy-2021-08-03-0127-5a1d-1000/eval-io-legacy-2021-08-03-0127-5a1d-1000.log 11 | experiments/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-07-26-0621-c473-3f2d/eval-io-legacy-2021-08-03-0138-5a1d-b5c0/eval-io-legacy-2021-08-03-0138-5a1d-b5c0.log 12 | experiments/AnghaBench-supervised-2021-07-25-1758-5a69ccfe0ea3eaf9fd9a09bff9555d9a-7b23-e039/infer-beam5-topn5-2021-07-28-1408-a148-80dc/eval-io-legacy-2021-08-03-0142-5a1d-941e/eval-io-legacy-2021-08-03-0142-5a1d-941e.log 13 | output/AnghaBench-supervised-2021-07-24-0506-4df2227e3af912140f3c3cdb37f8dcb6-a767-3294/infer-beam5-topn5-2021-07-26-0621-c473-2a5d/eval-io-legacy-2021-08-03-0148-5a1d-a23b/eval-io-legacy-2021-08-03-0148-5a1d-a23b.log 14 | experiments/AnghaBench-supervised-2021-07-25-1628-5a69ccfe0ea3eaf9fd9a09bff9555d9a-17e5-b3ad/infer-beam5-topn5-2021-07-28-1406-84ea-84b7/eval-io-legacy-2021-08-03-0152-5a1d-670a/eval-io-legacy-2021-08-03-0152-5a1d-670a.log 15 | data/AnghaBench-supervised-2021-05-09-1331-1ad3533bfc9df19cc9bff3ba72487765-3840-f78f/infer-beam5-topn5-2021-08-05-1320-56d3-bb77/eval-io-legacy-2021-08-05-1234-e80d-64f0/eval-io-legacy-2021-08-05-1234-e80d-64f0.log" 16 | 17 | grep $BEST_MODEL -e "IO = OK" | awk -F ":" '{print $3}' > $SCRIPT_DIR/best_ok.txt 18 | 19 | for path in $ALL 20 | do 21 | echo $path 22 | grep $path -e "IO = OK" | awk -F ":" '{print $3}' | comm -12 - $SCRIPT_DIR/best_ok.txt | wc -l 23 | done -------------------------------------------------------------------------------- /scripts/train/train-baseline-supervised-slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gres=gpu:1 3 | #SBATCH --mem=20G # Memory 4 | #SBATCH --ignore-pbs 5 | #SBATCH --output=baseline_%j.log 6 | 7 | source venv/bin/activate 8 | 9 | DATA_PATH="data/AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045" 10 | RUN="runs/baseline-AnghaBench-supervised-2021-04-12-2011-1ad3533bfc9df19cc9bff3ba72487765-1f9a-5045" 11 | 12 | mkdir -p $RUN 13 | 14 | fairseq-train \ 15 | $DATA_PATH \ 16 | --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ 17 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 18 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 19 | --dropout 0.3 --weight-decay 0.0001 \ 20 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 21 | --max-tokens 4096 \ 22 | --eval-bleu \ 23 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 24 | --eval-bleu-detok moses \ 25 | --eval-bleu-remove-bpe \ 26 | --eval-bleu-print-samples \ 27 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 28 | --save-dir $RUN/checkpoints 29 | -------------------------------------------------------------------------------- /scripts/train/train-baseline-supervised.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source venv/bin/activate 4 | 5 | DATA_PATH="data/AnghaBench-supervised-2021-04-12-1045-1ad3533bfc9df19cc9bff3ba72487765-71fb-f770" 6 | RUN="runs/baseline-AnghaBench-supervised-2021-04-12-1045-1ad3533bfc9df19cc9bff3ba72487765-71fb-f770" 7 | 8 | mkdir -p $RUN 9 | 10 | CUDA_VISIBLE_DEVICES=0,1 fairseq-train \ 11 | $DATA_PATH \ 12 | --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ 13 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 14 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 15 | --dropout 0.3 --weight-decay 0.0001 \ 16 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 17 | --max-tokens 4096 \ 18 | --eval-bleu \ 19 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 20 | --eval-bleu-detok moses \ 21 | --eval-bleu-remove-bpe \ 22 | --eval-bleu-print-samples \ 23 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --tensorboard-logdir $RUN/tb \ 24 | --save-dir $RUN/checkpoints >> $RUN/train.log -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Get data 4 | bash scripts/get-anghabench-data.sh 5 | 6 | # Get benchmark 7 | 8 | bash scripts/get-synthesis-benchmark.sh 9 | 10 | # Install dependencies 11 | python3.7 -m venv venv 12 | source venv/bin/activate 13 | python -m pip install -r requirements.txt 14 | apt install gcc 15 | -------------------------------------------------------------------------------- /slurm_logs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jordiae/neural-compilers/e1b109056730f9cf8b234293a5e4f8056b20727c/slurm_logs/.keep --------------------------------------------------------------------------------
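For quick reference, a minimal sketch of driving the GAS syntax checker from `neural_compilers/eval/evaluator.py` directly, mirroring what `evaluate-syntax.py` does per line of a detokenized generation file. The two hypothesis strings are illustrative placeholders (not real model output), and `gcc` is assumed to be on the PATH on an x86 host, as set up by `setup.sh`:

```python
from neural_compilers.eval.evaluator import GASSyntaxEvaluator

evaluator = GASSyntaxEvaluator()

# One detokenized hypothesis per entry (subword markers ' ##' already removed),
# as produced by infer.py or scripts/extract-gen-eval-syntax.sh.
hypotheses = [
    "\t.text\n\tret\n",        # should assemble cleanly with `gcc -c -x assembler -`
    "this is not assembly",    # gas rejects it, so asm_is_valid() returns False
]

results = [evaluator.asm_is_valid(asm) for asm in hypotheses]
print(evaluator.aggregate(results))
# e.g. {'syntactic_accuracy': 0.5, 'valid_compilations': 1, 'total': 2}
```

This is the same check `evaluate-syntax.py` applies line by line when `--system-output-path` points to a file of detokenized hypotheses, or file by file when it points to a directory.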