├── src
│   ├── deepcoder_utils.py
│   ├── train.py
│   ├── dsl.py
│   ├── program_generator.py
│   ├── inference.py
│   ├── dataset.py
│   ├── model.py
│   ├── program_simplifier.py
│   └── generate_dataset.py
├── images
│   └── inference.png
├── bin
│   ├── lint.bash
│   ├── init.bash
│   └── run-jupyter.bash
├── .gitmodules
├── examples
│   └── medium
│       ├── trained-model
│       │   ├── model.npz
│       │   └── model-shape.pickle
│       ├── generate_dataset.ipynb
│       └── train_w0_0.25.ipynb
├── requirements.txt
├── .gitignore
├── test
│   ├── test_dsl.py
│   ├── test_program_generator.py
│   ├── test_dataset.py
│   ├── test_inference.py
│   ├── test_model.py
│   ├── test_generate_dataset.py
│   └── test_program_simplifier.py
├── README.md
├── inference.ipynb
├── validate_baseline.ipynb
├── inspect_model.ipynb
├── generate_dataset.ipynb
├── train.ipynb
├── inspect_validation_results.ipynb
└── inspect_dataset.ipynb

/src/deepcoder_utils.py:
--------------------------------------------------------------------------------
from DeepCoder_Utils import generate_io_samples

--------------------------------------------------------------------------------
/images/inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiroakiMikami/deep-coder/HEAD/images/inference.png

--------------------------------------------------------------------------------
/bin/lint.bash:
--------------------------------------------------------------------------------
#! /bin/bash

find ./ ./src ./test -maxdepth 1 -name "*.py" | xargs -n 1 autopep8 --in-place

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "DeepCoder_Utils"]
	path = DeepCoder_Utils
	url = https://github.com/HiroakiMikami/DeepCoder-Utils.git

--------------------------------------------------------------------------------
/examples/medium/trained-model/model.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiroakiMikami/deep-coder/HEAD/examples/medium/trained-model/model.npz

--------------------------------------------------------------------------------
/examples/medium/trained-model/model-shape.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiroakiMikami/deep-coder/HEAD/examples/medium/trained-model/model-shape.pickle

--------------------------------------------------------------------------------
/bin/init.bash:
--------------------------------------------------------------------------------
#! /bin/bash

jupyter serverextension enable --py jupyter_http_over_ws
jupyter nbextension enable --py --sys-prefix widgetsnbextension

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
autopep8~=1.4.4
numpy~=1.17.0
docopt~=0.6.2
tqdm~=4.34.0
matplotlib~=3.1.1
pandas~=0.25.1
chainer~=6.3.0
jupyter~=1.0.0
jupyter-http-over-ws~=0.0.6

--------------------------------------------------------------------------------
/bin/run-jupyter.bash:
--------------------------------------------------------------------------------
#! 
/bin/bash 2 | 3 | set -u 4 | 5 | jupyter notebook \ 6 | --NotebookApp.allow_origin='https://colab.research.google.com' \ 7 | --port=${1:-8888} \ 8 | --NotebookApp.port_retries=0 \ 9 | --no-browser 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*/ 3 | !/.gitignore 4 | !/README.md 5 | !/images/*.png 6 | !/DeepCoder_Utils 7 | !/requirements.txt 8 | !/*.ipynb 9 | !/src/*.py 10 | !/test/*.py 11 | !/bin/*.bash 12 | !/examples/**/*.ipynb 13 | !/examples/**/trained-model/*.npz 14 | !/examples/**/trained-model/*.pickle 15 | -------------------------------------------------------------------------------- /test/test_dsl.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from src.dsl import Function, Type, Variable, Expression, Program, Signature, Statement 4 | 5 | 6 | class Test_Program(unittest.TestCase): 7 | def test_to_string(self): 8 | self.assertEqual("a <- int\nb <- [int]\n", 9 | Program([Variable(0, Type.Int), Variable(1, Type.IntList)], []).to_string()) 10 | F = Function("FUNC", Signature([Type.Int], Type.IntList)) 11 | self.assertEqual("a <- int\nb <- [int]\nc <- FUNC b a\n", Program( 12 | [Variable(0, Type.Int), Variable(1, Type.IntList)], 13 | [Statement(Variable(2, Type.Int), Expression( 14 | F, [Variable(1, Type.Int), Variable(0, Type.Int)]))] 15 | ).to_string()) 16 | 17 | def test_clone(self): 18 | F = Function("FUNC", Signature([Type.Int, Type.IntList], Type.IntList)) 19 | a = Variable(0, Type.Int) 20 | b = Variable(1, Type.IntList) 21 | c = Variable(1, Type.IntList) 22 | p = Program([a, b], [Statement(c, Expression(F, [a, b]))]) 23 | p_clone = p.clone() 24 | self.assertEqual(p, p_clone) 25 | 26 | p_clone.inputs[0].id = 2 27 | self.assertEqual(0, p.inputs[0].id) 28 | self.assertEqual(0, p_clone.body[0].expression.arguments[0].id) 29 | 30 | 31 | if __name__ == "__main__": 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import numpy as np 3 | from typing import List, Union, Dict, Set 4 | import chainer as ch 5 | from chainer import training 6 | from chainer.training import extensions 7 | from chainer import cuda 8 | from .model import ModelShapeParameters, Predictor, TrainingClassifier 9 | 10 | 11 | def convert_entry(batch, device): 12 | if device is None: 13 | def to_device(x): 14 | return x 15 | elif device < 0: 16 | to_device = cuda.to_cpu 17 | else: 18 | def to_device(x): 19 | return cuda.to_gpu(x, device, cuda.Stream.null) 20 | 21 | return (to_device(np.array([types for types, _, _ in batch])), 22 | to_device(np.array([values for _, values, _ in batch])), 23 | np.array([attribute for _, _, attribute in batch])) 24 | 25 | 26 | class Training: 27 | """ 28 | Store the instances for training 29 | 30 | Attributes 31 | ---------- 32 | predictor : ch.Link 33 | The attribute predictor of DeepCoder 34 | model : ch.Link 35 | trainer : training.Trainer 36 | """ 37 | 38 | def __init__(self, 39 | train_iter, test_iter, out: str, 40 | params: ModelShapeParameters, w_0: float, 41 | num_epochs: int, optimizer=ch.optimizers.Adam(), device=-1): 42 | """ 43 | Constructor 44 | 45 | Parameters 46 | ---------- 47 | train_iter : iterator 48 | The iterator of the training dataset 49 | test_iter : iterator or None 50 | The 
iterator of the test dataset
        out : str
            The path of the output directory
        params : ModelShapeParameters
        w_0 : float
            The weight for label=False
        num_epochs : int
            The number of epochs
        optimizer
        device : int
            The device used for training
        """

        self.predictor = Predictor(params)
        self.model = TrainingClassifier(self.predictor, w_0)
        optimizer.setup(self.model)
        updater = training.StandardUpdater(
            train_iter, optimizer, device=device, converter=convert_entry)
        self.trainer = training.Trainer(
            updater, (num_epochs, "epoch"), out=out)
        if test_iter is not None:
            self.trainer.extend(extensions.Evaluator(
                test_iter, self.model, device=device, converter=convert_entry))

--------------------------------------------------------------------------------
/test/test_program_generator.py:
--------------------------------------------------------------------------------
import unittest
import numpy as np

from src.dsl import Function, Type, Variable, Expression, Program, Signature
from src.program_generator import arguments, programs, random_programs, IdGenerator


class Test_program_generator(unittest.TestCase):
    def test_arguments(self):
        g = IdGenerator()
        args = list(arguments(g, set([Variable(g.generate(), Type.Int), Variable(
            g.generate(), Type.IntList)]), [Type.Int, Type.IntList]))
        """
        [v(0), v(1)]
        [v(0), v_new]
        [v_new, v(1)]
        [v_new1, v_new2]
        """
        self.assertEqual(4, len(args))
        self.assertEqual(2, g.generate())

    def test_arguments_if_arguments_with_same_type(self):
        g = IdGenerator()
        args = list(arguments(g, set(), [Type.Int, Type.Int]))
        """
        [v_new, v_new]
        [v_new1, v_new2]
        """
        self.assertEqual(2, len(args))
        self.assertEqual(0, g.generate())

    def test_arguments_if_no_existing_variables(self):
        # No existing variable
        g = IdGenerator()
        args = list(arguments(g, set(), [Type.Int, Type.IntList]))
        self.assertEqual(1, len(args))
        self.assertEqual([Variable(0, Type.Int), Variable(
            1, Type.IntList)], args[0].arguments)
        self.assertEqual(
            set([Variable(0, Type.Int), Variable(1, Type.IntList)]), args[0].variables)
        self.assertEqual([Variable(0, Type.Int), Variable(
            1, Type.IntList)], args[0].new_variables)
        self.assertEqual(2, args[0].generator.generate())
        self.assertEqual(0, g.generate())

    def test_programs(self):
        TAKE = Function("TAKE", Signature(
            [Type.Int, Type.IntList], Type.IntList))
        HEAD = Function("HEAD", Signature([Type.IntList], Type.Int))
        srcs = set(map(lambda x: x.to_string(),
                       programs([TAKE, HEAD], 1, 1)))
        self.assertEqual(
            set(["a <- int\nb <- [int]\nc <- TAKE a b\n",
                 "a <- [int]\nb <- HEAD a\n"]),
            srcs
        )

        srcs = list(programs([TAKE], 2, 2))
        l = set(map(lambda x: len(x.body), srcs))

        self.assertEqual(set([2]), l)


class Test_random_programs(unittest.TestCase):
    def test_random_programs(self):
        TAKE = Function("TAKE", Signature(
            [Type.Int, Type.IntList], Type.IntList))
        HEAD = Function("HEAD", Signature([Type.IntList], Type.Int))
        l = []
        for _, program in zip(range(100), random_programs([TAKE, HEAD], 1, 2, rng=np.random.RandomState(100))):
            l.append(len(program.body))
        self.assertTrue(min(l) >= 1)
        self.assertTrue(max(l) >= 2)


if __name__ == 
"__main__": 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import chainer as ch 3 | import numpy as np 4 | 5 | from src.dataset import Example, Entry, prior_distribution, primitive_encoding, attribute_encoding, examples_encoding, EncodedDataset, dataset_metadata, DatasetMetadata, Dataset 6 | 7 | 8 | class Test_dataset(unittest.TestCase): 9 | def test_dataset_metadata(self): 10 | e0 = Entry("HEAD", [Example([[10, 20]], 10)], dict( 11 | [["HEAD", True], ["TAKE", False]])) 12 | e1 = Entry("TAKE", [Example([1, [10, 20]], 10)], dict( 13 | [["HEAD", False], ["TAKE", True]])) 14 | dataset = ch.datasets.TupleDataset([e0, e1]) 15 | stats = dataset_metadata(dataset) 16 | self.assertEqual(2, stats.max_num_inputs) 17 | self.assertEqual(set(["HEAD", "TAKE"]), stats.symbols) 18 | 19 | def test_prior_distribution(self): 20 | dataset = ch.datasets.TupleDataset([ 21 | Entry("", [], dict([["F1", True], ["F2", False]])), 22 | Entry("", [], dict([["F1", True], ["F2", True]])) 23 | ]) 24 | prior = prior_distribution(dataset) 25 | self.assertAlmostEqual(1.0, prior["F1"]) 26 | self.assertAlmostEqual(0.5, prior["F2"]) 27 | 28 | def test_primitive_encoding(self): 29 | encoding = primitive_encoding(-10, DatasetMetadata(0, set([]), 256, 2)) 30 | self.assertEqual(0, encoding.t) 31 | self.assertTrue( 32 | np.all(np.array([-10 + 256, 512] == encoding.value_arr))) 33 | 34 | encoding = primitive_encoding( 35 | [1, 2], DatasetMetadata(0, set([]), 256, 3)) 36 | self.assertEqual(1, encoding.t) 37 | self.assertTrue( 38 | np.all(np.array([257, 258, 512] == encoding.value_arr))) 39 | 40 | def test_attribute_encoding(self): 41 | encoding = attribute_encoding(dict([ 42 | ["A", True], 43 | ["B", False]])) 44 | self.assertTrue(np.all(np.array([1, 0]) == encoding)) 45 | 46 | def test_examples_encoding_if_num_inputs_is_too_large(self): 47 | metadata = DatasetMetadata(0, set([]), 2, 2) 48 | self.assertRaises(RuntimeError, lambda: examples_encoding( 49 | [Example([1, [0, 1]], [0]), Example([0, [0, 1]], [])], metadata)) 50 | 51 | def test_EncodedDataset_constructor(self): 52 | dataset = ch.datasets.TupleDataset([ 53 | Entry("entry1", [Example(([10, 20, 30],), 10)], 54 | dict([["HEAD", True], ["SORT", False]])), 55 | Entry( 56 | "entry2", 57 | [Example(([30, 20, 10],), [10, 20, 30])], 58 | dict([["HEAD", False], ["SORT", True]]) 59 | ) 60 | ]) 61 | 62 | cdataset = EncodedDataset( 63 | Dataset(dataset, DatasetMetadata(1, set(["HEAD", "SORT"]), 256, 5))) 64 | [(types0, values0, attribute0), 65 | (types1, values1, attribute1)] = list(cdataset) 66 | 67 | self.assertTrue(np.all([[[0, 1], [1, 0]]] == types0)) 68 | self.assertTrue( 69 | np.all([[ 70 | [266, 276, 286, 512, 512], 71 | [266, 512, 512, 512, 512] 72 | ]] == values0)) 73 | self.assertTrue(np.all(np.array([1, 0]) == attribute0)) 74 | 75 | self.assertTrue(np.all([[[0, 1], [0, 1]]] == types1)) 76 | self.assertTrue( 77 | np.all([[ 78 | [286, 276, 266, 512, 512], 79 | [266, 276, 286, 512, 512] 80 | ]] == values1)) 81 | self.assertTrue(np.all(np.array([0, 1]) == attribute1)) 82 | 83 | 84 | if __name__ == "__main__": 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /test/test_inference.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | import chainer as 
ch 5 | 6 | from src.dataset import Entry, Example, examples_encoding, DatasetMetadata 7 | from src.deepcoder_utils import generate_io_samples 8 | from src.model import ModelShapeParameters 9 | from src.inference import search, predict_with_prior_distribution, predict_with_neural_network, InferenceModel 10 | 11 | 12 | class Test_inferense(unittest.TestCase): 13 | def test_search(self): 14 | # example of access 15 | examples = [ 16 | Example([2, [10, 20, 30]], 30), 17 | Example([1, [-10, 30, 40]], 30) 18 | ] 19 | 20 | def pred(examples): 21 | LINQ, _ = generate_io_samples.get_language(50) 22 | LINQ = [f for f in LINQ if not "IDT" in f.src] 23 | prob = dict() 24 | for function in LINQ: 25 | for name in function.src.split(" "): 26 | if name == "ACCESS": 27 | prob[name] = 0.8 28 | else: 29 | prob[name] = 0.2 30 | return prob 31 | 32 | result = search( 33 | os.path.join(os.getcwd(), "DeepCoder_Utils", 34 | "enumerative-search", "search"), 1000, 256, 35 | examples, 2, pred) 36 | 37 | self.assertTrue(result.is_solved) 38 | self.assertAlmostEqual(0.8, result.probabilities["ACCESS"]) 39 | self.assertAlmostEqual(0.2, result.probabilities["HEAD"]) 40 | self.assertEqual(1, result.explored_nodes) 41 | self.assertEqual(" %2 <- access %0 %1\n", result.solution) 42 | 43 | def test_search_when_pred_throws_error(self): 44 | # example that do not correspond to any programs 45 | examples = [ 46 | Example([2, [10, 20, 30]], -255), 47 | Example([1, [-10, 30, 40]], -255) 48 | ] 49 | 50 | def pred(examples): 51 | raise RuntimeError("test") 52 | 53 | result = search( 54 | os.path.join(os.getcwd(), "DeepCoder_Utils", 55 | "enumerative-search", "search"), 1000, 256, 56 | examples, 2, pred) 57 | 58 | self.assertFalse(result.is_solved) 59 | self.assertEqual(-1, result.explored_nodes) 60 | self.assertEqual(dict([]), result.probabilities) 61 | self.assertEqual("", result.solution) 62 | 63 | def test_search_with_invalid_examples(self): 64 | # example that do not correspond to any programs 65 | examples = [ 66 | Example([2, [10, 20, 30]], -255), 67 | Example([1, [-10, 30, 40]], -255) 68 | ] 69 | 70 | def pred(examples): 71 | LINQ, _ = generate_io_samples.get_language(50) 72 | LINQ = [f for f in LINQ if not "IDT" in f.src] 73 | prob = dict() 74 | for function in LINQ: 75 | for name in function.src.split(" "): 76 | prob[name] = 1.0 77 | return prob 78 | 79 | result = search( 80 | os.path.join(os.getcwd(), "DeepCoder_Utils", 81 | "enumerative-search", "search"), 1000, 256, 82 | examples, 2, pred) 83 | 84 | self.assertFalse(result.is_solved) 85 | self.assertEqual(-1, result.explored_nodes) 86 | self.assertEqual("", result.solution) 87 | 88 | def test_predict_with_prior_distribution(self): 89 | dataset = ch.datasets.TupleDataset([ 90 | Entry("e0", [], dict([["MAP", True], ["HEAD", True]])), 91 | Entry("e1", [], dict([["MAP", False], ["HEAD", True]])) 92 | ]) 93 | pred = predict_with_prior_distribution(dataset) 94 | prob = pred([]) 95 | self.assertEqual(dict([["MAP", 0.5], ["HEAD", 1.0]]), prob) 96 | 97 | def test_predict_with_neural_network(self): 98 | examples = [ 99 | Example([2, [10, 20, 30]], 30), 100 | Example([1, [-10, 30, 40]], 30) 101 | ] 102 | metadata = DatasetMetadata( 103 | 2, set(["MAP", "HEAD"]), 256, 5) 104 | model_shape = ModelShapeParameters(metadata, 3, 2, 10) 105 | m = InferenceModel(model_shape) 106 | pred = predict_with_neural_network(model_shape, m) 107 | prob = pred(examples) 108 | 109 | encoding = examples_encoding(examples, metadata) 110 | prob_dnn = m.model(np.array([encoding.types]), np.array( 111 | 
[encoding.values])).array[0]

        self.assertAlmostEqual(prob_dnn[0], prob["HEAD"])
        self.assertAlmostEqual(prob_dnn[1], prob["MAP"])


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
deep-coder
===

This repository is a re-implementation of [DeepCoder](https://openreview.net/pdf?id=ByldLrqlx). DeepCoder can synthesize domain-specific programs from input/output examples.


Notice (Sep/1/2019)
---

I rewrote the implementation from scratch. The previous implementation is in the [v0.0.0 tag](https://github.com/HiroakiMikami/deep-coder/tree/v0.0.0).


Requirements
---

### Colab

* Google account

### Local Runtime

* Linux (the code will probably work on macOS as well, but I have not tested it there)
* Python 3
* `make`
* `g++`


Usage
---

_*Warning*_
*The notebooks in the `examples` directory use Google Drive as data storage. Please be careful not to overwrite your data!*

### Quickstart

`inference.ipynb` synthesizes DSL programs by using the pre-trained model (`examples/medium/trained-model`).

![](images/inference.png)


### Training

#### 1. Setup (if you use local runtimes)

```bash
# Download this repository and DeepCoder-Utils
$ git clone https://github.com/HiroakiMikami/deep-coder
$ cd deep-coder
$ git submodule init
$ git submodule update
# Build the search tool
$ make -C DeepCoder_Utils/enumerative-search -j $(nproc)
# Install python modules
$ pip install -r requirements.txt
# Setup Jupyter notebooks to use local runtimes of Colab
$ ./bin/init.bash
```

#### 2. Training

The notebooks in the `examples/medium` directory show how to train DeepCoder.
Training consists of the following steps:

1. generate the dataset (`examples/medium/generate_dataset.ipynb`)
   * It may take more than 1 hour in Colab.
   * In the above example, I used [a local runtime](https://research.google.com/colaboratory/local-runtimes.html) and then uploaded the dataset file to Google Drive (`DeepCoder/dataset/length_3`).
2. generate the baseline result (`examples/medium/generate_baseline_results.ipynb`)
3. train the DNN model by using the training dataset and validate the model (`examples/medium/train.ipynb`)
4. compare the results of the DNN model with the baseline (`examples/medium/comparison_with_baseline.ipynb`)

### Run unit tests

```bash
$ python -m unittest discover test
```


Result
---

### Small Scale Experiment

`examples/small/integer_embeddings.ipynb` shows the learned embedding of integers. The embedding was trained by using the dataset with length=1 programs and the `E=2` model.

It does not show the clear trend reported in Figure 8 of the [paper](https://openreview.net/pdf?id=ByldLrqlx).
There are many possible causes (e.g., the procedure of dataset generation, the training hyperparameters), and I don't know what the root cause of this difference is.
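The following is a minimal sketch of how such an embedding plot can be reproduced. It is not one of the notebooks in this repository: the file paths are placeholders, and it assumes that the embedding matrix can be identified among the predictor's parameters by its row count (`2 * value_range + 1`, one row per encodable integer plus the NULL value, matching how `ExampleEmbed` is constructed in `src/model.py`).

```python
import pickle
import chainer as ch
import matplotlib.pyplot as plt
from src.model import Predictor

# Placeholder paths: any trained model-shape/weights pair can be used.
with open("trained-model/model-shape.pickle", "rb") as f:
    model_shape = pickle.load(f)
predictor = Predictor(model_shape)
ch.serializers.load_npz("trained-model/model.npz", predictor)

# Assumption: the integer embedding is the unique 2-D parameter with one row
# per encodable value (integers in [-value_range, value_range) plus NULL).
n_values = 2 * model_shape.dataset_metadata.value_range + 1
embed_w = next(p.array for p in predictor.params()
               if p.ndim == 2 and p.shape[0] == n_values)

# With E=2, the two embedding dimensions can be plotted directly.
plt.scatter(embed_w[:, 0], embed_w[:, 1], s=4)
plt.show()
```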

### Medium Scale Experiment (Smaller-scale experiment in the paper)

|Timeout needed to solve |20% |40% |60% |
|:--- |:--- |:--- |:--- |
|Baseline |53ms |122ms|375ms|
|DeepCoder |5ms |24ms |87ms |
|Speedup (this implementation) |10.8x|5.0x |3.6x |
|Speedup (Table 1 in the [paper](https://openreview.net/pdf?id=ByldLrqlx))|62.2x|54.6x|31.5x|

The trained model speeds up the program synthesis. However, the performance of this implementation is worse than that of the paper. I think the reason for this difference is the same as the reason for the integer-embedding difference, but I have no evidence for this.

The details of the results are in `examples/medium/comparison_with_baseline.ipynb`.


### Loss Function

The binary attribute predicted by the DNN is heavily imbalanced because each program in the dataset contains only 1-3 functions. For example, the attribute of `a <- int | b <- [int] | c <- TAKE a b` contains 33 `False` values and only 1 `True`.

I suspected that this imbalance decreases the performance of the DNN model, and introduced a cost-sensitive loss function (`weighted_sigmoid_cross_entropy` in `src/model.py`).

However, I could not see a performance improvement in the medium scale experiment. `examples/medium/loss_function_comparison.ipynb` shows the results of the training using the cost-sensitive loss function, and `examples/medium/train_w0_{0.25|0.5|0.75}.ipynb` shows the training logs.


Todos
---

* [ ] Investigate the difference from the original paper
* [ ] Run the large scale experiment (train with the program length of `4` dataset)

--------------------------------------------------------------------------------
/src/dsl.py:
--------------------------------------------------------------------------------
import dataclasses
from enum import Enum
from typing import List
import copy
from src.deepcoder_utils import generate_io_samples


class Type(Enum):
    Int = 1
    IntList = 2


@dataclasses.dataclass
class Signature:
    """
    The function signature of DSL programs

    Attributes
    ----------
    input_types: List[Type]
    output_type: Type
    """
    input_types: List[Type]
    output_type: Type

    def __eq__(self, rhs):
        return self.input_types == rhs.input_types and self.output_type == rhs.output_type

    def __hash__(self):
        return hash((tuple(self.input_types), self.output_type))


@dataclasses.dataclass
class Function:
    """
    The function of DSL programs

    Attributes
    ----------
    name : str
        The name of this function
    signature : Signature
        The signature of this function
    """
    name: str
    signature: Signature

    def __eq__(self, rhs):
        return self.name == rhs.name and self.signature == rhs.signature

    def __hash__(self):
        return hash((self.name, (tuple(self.signature.input_types), self.signature.output_type)))


@dataclasses.dataclass
class Variable:
    """
    The variable of DSL programs

    Attributes
    ----------
    id : int
        The identifier of this variable
    t : Type
        The type of this variable
    """

    id: int
    t: Type

    def __eq__(self, rhs):
        return self.id == rhs.id and self.t == rhs.t

    def __hash__(self):
        return hash((self.id, self.t))


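# ---------------------------------------------------------------------------
# Example (a sketch, not part of the original source): how the dataclasses in
# this module compose into a program. `Expression`, `Statement`, and
# `Program` are defined below; the expected string follows
# `Program.to_string` (test/test_dsl.py exercises the same behavior).
#
#   head = Function("HEAD", Signature([Type.IntList], Type.Int))
#   xs = Variable(0, Type.IntList)
#   p = Program([xs],
#               [Statement(Variable(1, Type.Int), Expression(head, [xs]))])
#   p.to_string()  # => "a <- [int]\nb <- HEAD a\n"
# ---------------------------------------------------------------------------
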
@dataclasses.dataclass
class Expression:
    """
    The expression of DSL programs

    Attributes
    ----------
    function : Function
        The function that this expression calls
    arguments: list of Variable
        The arguments of the function call
    """

    function: Function
    arguments: List[Variable]


@dataclasses.dataclass
class Statement:
    """
    The statement of DSL

    Attributes
    ----------
    variable : Variable
        The variable defined in this statement
    expression : Expression
    """

    variable: Variable
    expression: Expression


@dataclasses.dataclass
class Program:
    """
    The program of DSL

    Attributes
    ----------
    inputs : list of Variable
        The input variables of this program
    body : list of (Variable, Expression)
        The body of this program.
        The interpreter will execute an expression and store the result
        to the variable for each element of the list.
    """

    inputs: List[Variable]
    body: List[Statement]

    def to_string(self) -> str:
        """
        Return the source code of the program

        Returns
        -------
        code : string
            The source code of this program
        """

        code = ""

        def id_to_name(id: int) -> str:
            name = ""
            while True:
                x = id % 26
                id //= 26
                name += chr(x + ord('a'))
                if id == 0:
                    break
            return name

        for input in self.inputs:
            code += "{} <- {}\n".format(id_to_name(input.id),
                                        "int" if input.t == Type.Int else "[int]")
        for statement in self.body:
            code += "{} <- {} {}\n".format(id_to_name(statement.variable.id), statement.expression.function.name, " ".join(
                map(lambda x: id_to_name(x.id), statement.expression.arguments)))

        return code

    def clone(self):
        """
        Return a copy of the program.
        The original and the returned copy will not share any objects,
        so we can freely modify the return value.
165 | 166 | Returns 167 | ------- 168 | cloned_program : Program 169 | The program that is same as this program 170 | """ 171 | 172 | inputs = [] 173 | body = [] 174 | for input in self.inputs: 175 | inputs.append(copy.deepcopy(input)) 176 | for statement in self.body: 177 | args = [] 178 | for arg in statement.expression.arguments: 179 | args.append(copy.deepcopy(arg)) 180 | body.append(Statement(copy.deepcopy(statement.variable), 181 | Expression(statement.expression.function, args))) 182 | 183 | return Program(inputs, body) 184 | 185 | 186 | def to_function(f: generate_io_samples.Function) -> Function: 187 | """ 188 | Convert from generate_io_samples.Function to dsl.Function 189 | 190 | Parameters 191 | ---------- 192 | f : generate_io_samples.Function 193 | The function that will be converted 194 | 195 | Returns 196 | ------- 197 | Function 198 | The converted Function instance 199 | """ 200 | intype = [] 201 | for s in f.sig[:-1]: 202 | intype.append(Type.Int if s == int else Type.IntList) 203 | outtype = Type.Int if f.sig[-1] == int else Type.IntList 204 | return Function(f.src, Signature(intype, outtype)) 205 | -------------------------------------------------------------------------------- /src/program_generator.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import numpy as np 3 | from typing import List, Set, Union 4 | import copy 5 | from .dsl import Function, Type, Variable, Expression, Program, Statement 6 | 7 | 8 | class IdGenerator: 9 | """ 10 | The class to generate a unique id 11 | 12 | Attributes 13 | ---------- 14 | _n : int 15 | The next id 16 | """ 17 | 18 | def __init__(self): 19 | self._n = 0 20 | 21 | def generate(self): 22 | """ 23 | Return a unique id 24 | 25 | Returns 26 | ------- 27 | id : int 28 | A generated unique id 29 | """ 30 | id = self._n 31 | self._n += 1 32 | return id 33 | 34 | 35 | @dataclasses.dataclass 36 | class ArgumentWithState: 37 | arguments: List[Variable] 38 | generator: IdGenerator 39 | variables: Set[Variable] 40 | new_variables: List[Variable] 41 | 42 | 43 | def arguments(id_generator: IdGenerator, variables: Set[Variable], signature): 44 | """ 45 | Enumerate all arguments that match the signature 46 | 47 | Parameters 48 | ---------- 49 | id_generator : IdGenerator 50 | The generator used to create new variables 51 | variables : set of Variable 52 | The set of variables that are currently defined 53 | signature : list of Type 54 | The signature of the arguments. 
55 | 56 | Yields 57 | ------ 58 | ArgumentWithState 59 | The argument list and the state that will be used to continue enumeration 60 | """ 61 | 62 | # Perform DFS to enumerate arguments 63 | # Start from an empty list 64 | s = list([ArgumentWithState([], id_generator, variables, set())]) 65 | while len(s) != 0: 66 | elem = s.pop() 67 | 68 | if len(elem.arguments) == len(signature): 69 | yield elem 70 | else: 71 | t_arg = signature[len(elem.arguments)] # The type of the argument 72 | # Existing variables which type is t_arg 73 | candidates = [v for v in elem.variables if v.t == t_arg] 74 | 75 | for v in candidates: 76 | # Use existing var 77 | s.append(ArgumentWithState( 78 | [*(elem.arguments), v], elem.generator, elem.variables, elem.new_variables)) 79 | # Create new var 80 | generator_new = copy.deepcopy(elem.generator) 81 | v_new = Variable(generator_new.generate(), t_arg) 82 | arg_new = [*(elem.arguments), v_new] 83 | vars_new = set([*(elem.variables), v_new]) 84 | s.append(ArgumentWithState(arg_new, generator_new, 85 | vars_new, [*(elem.new_variables), v_new])) 86 | 87 | 88 | def programs(functions: List[Function], min_length: int, max_length: int): 89 | """ 90 | Enumerate all programs which length is in [min_length:max_length] 91 | 92 | Parameters 93 | ---------- 94 | functions : list of Function 95 | All functions that can be used in source code 96 | min_length : int 97 | The minimum length of programs 98 | max_length : int 99 | The maximum length of programs 100 | 101 | Yields 102 | ------ 103 | Program 104 | The program which length is in [min_length:max_length] 105 | """ 106 | assert(min_length <= max_length) 107 | 108 | # Perform DFS to enumerate source code 109 | # Start from a program with no expressions 110 | s = [(Program([], []), IdGenerator())] 111 | while len(s) != 0: 112 | p, g = s.pop() 113 | 114 | if min_length <= len(p.body) <= max_length: 115 | yield p 116 | if len(p.body) >= max_length: 117 | continue 118 | 119 | # Create a set of variables 120 | vars = set(p.inputs + list(map(lambda x: x.variable, p.body))) 121 | 122 | # Enumerate functions 123 | for func in functions: 124 | # Enumerate arguments 125 | for a in arguments(g, vars, func.signature.input_types): 126 | p_new = copy.deepcopy(p) 127 | for v in a.new_variables: 128 | p_new.inputs.append(v) 129 | generator = copy.deepcopy(a.generator) 130 | p_new.body.append( 131 | Statement(Variable(generator.generate(), func.signature.output_type), Expression(func, a.arguments))) 132 | s.append((p_new, generator)) 133 | 134 | 135 | def random_programs(functions: List[Function], min_length: int, max_length: int, rng: Union[None, np.random.RandomState] = None): 136 | """ 137 | Generate random programs which length is in [min_length:max_length] 138 | 139 | Parameters 140 | ---------- 141 | functions : list of Function 142 | All functions that can be used in source code 143 | min_length : int 144 | The minimum length of programs 145 | max_length : int 146 | The maximum length of programs 147 | rng : None or np.random.RandomState 148 | The random number generator 149 | 150 | Yields 151 | ------ 152 | Program 153 | The program which length is in [min_length:max_length] 154 | """ 155 | if rng is None: 156 | rng = np.random 157 | 158 | while True: 159 | assert(min_length <= max_length) 160 | 161 | # Decide the length of the program 162 | length = rng.randint(min_length, max_length + 1) 163 | 164 | program = Program([], []) 165 | generator = IdGenerator() 166 | for i in range(length): 167 | # Create a set of variables 168 | vars 
= set(program.inputs + 169 | list(map(lambda x: x.variable, program.body))) 170 | 171 | # Decide the functions 172 | func = rng.choice(functions) 173 | 174 | # Decide the arguments 175 | arg = rng.choice( 176 | list(arguments(generator, vars, func.signature.input_types))) 177 | 178 | # Add the function call 179 | for v in arg.new_variables: 180 | program.inputs.append(v) 181 | generator = arg.generator 182 | program.body.append(Statement(Variable(generator.generate( 183 | ), func.signature.output_type), Expression(func, arg.arguments))) 184 | 185 | yield program 186 | -------------------------------------------------------------------------------- /test/test_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | import chainer as ch 4 | import chainer.functions as F 5 | 6 | from src.model import ExampleEmbed, Encoder, Decoder, TrainingClassifier, tupled_binary_accuracy 7 | from src.dataset import Example, examples_encoding, DatasetMetadata 8 | 9 | 10 | class Test_model(unittest.TestCase): 11 | def test_example_embed_embed_one_sample(self): 12 | embed = ExampleEmbed(1, 2, 1, (np.arange(5) + 1).reshape((5, 1))) 13 | self.assertEqual(1, len(list(embed.params()))) 14 | """ 15 | EmbedId 16 | 0 (-2) -> 1 17 | 1 (-1) -> 2 18 | 2 ( 0) -> 3 19 | 3 ( 1) -> 4 20 | 4 (NULL) -> 5 21 | """ 22 | 23 | e = examples_encoding( 24 | [Example([[0, 1]], 0), Example([[1]], 1)], 25 | DatasetMetadata(1, set([]), 2, 2) 26 | ) 27 | 28 | state_embeddings = embed.forward( 29 | np.array([e.types]), np.array([e.values])) 30 | self.assertEqual((1, 2, 2, 2 + 2 * 1), state_embeddings.shape) 31 | self.assertTrue(np.allclose( 32 | [0, 1, 3, 4], state_embeddings.array[0, 0, 0])) # Input of e1 33 | self.assertTrue(np.allclose( 34 | [1, 0, 3, 5], state_embeddings.array[0, 0, 1])) # Output of e1 35 | self.assertTrue(np.allclose( 36 | [0, 1, 4, 5], state_embeddings.array[0, 1, 0])) # Input of e2 37 | self.assertTrue(np.allclose( 38 | [1, 0, 4, 5], state_embeddings.array[0, 1, 1])) # Output of e2 39 | 40 | # backward does not throw an error 41 | state_embeddings.grad = np.ones( 42 | state_embeddings.shape, dtype=np.float32) 43 | state_embeddings.backward() 44 | 45 | # minibatch with mask 46 | def test_example_embed_embed_minibatch_with_different_number_of_inputs(self): 47 | embed = ExampleEmbed(2, 2, 1, (np.arange(5) + 1).reshape((5, 1))) 48 | """ 49 | EmbedId 50 | 0 (-2) -> 1 51 | 1 (-1) -> 2 52 | 2 ( 0) -> 3 53 | 3 ( 1) -> 4 54 | 4 (NULL) -> 5 55 | """ 56 | 57 | metadata = DatasetMetadata(2, set([]), 2, 2) 58 | e0 = examples_encoding( 59 | [Example([[0, 1]], 0), Example([[1]], 1)], metadata) 60 | e1 = examples_encoding( 61 | [Example([1, [0, 1]], [0]), Example([0, [0, 1]], [])], metadata) 62 | 63 | state_embeddings = embed.forward( 64 | np.array([e0.types, e1.types]), np.array([e0.values, e1.values])) 65 | self.assertEqual((2, 2, 3, 2 + 2 * 1), state_embeddings.shape) 66 | self.assertTrue(np.allclose( 67 | [0, 1, 3, 4], state_embeddings.array[0, 0, 0])) # Input of e00 68 | self.assertTrue(np.allclose( 69 | [0, 0, 5, 5], state_embeddings.array[0, 0, 1])) # Input of e00 70 | # Output of e00 71 | self.assertTrue(np.allclose( 72 | [1, 0, 3, 5], state_embeddings.array[0, 0, 2])) 73 | self.assertTrue(np.allclose( 74 | [0, 1, 4, 5], state_embeddings.array[0, 1, 0])) # Input of e01 75 | self.assertTrue(np.allclose( 76 | [0, 0, 5, 5], state_embeddings.array[0, 1, 1])) # Input of e01 77 | # Output of e01 78 | self.assertTrue(np.allclose( 79 | [1, 0, 4, 5], 
state_embeddings.array[0, 1, 2])) 80 | self.assertTrue(np.allclose( 81 | [1, 0, 4, 5], state_embeddings.array[1, 0, 0])) # Input of e10 82 | self.assertTrue(np.allclose( 83 | [0, 1, 3, 4], state_embeddings.array[1, 0, 1])) # Input of e10 84 | # Output of e10 85 | self.assertTrue(np.allclose( 86 | [0, 1, 3, 5], state_embeddings.array[1, 0, 2])) 87 | self.assertTrue(np.allclose( 88 | [1, 0, 3, 5], state_embeddings.array[1, 1, 0])) # Input of e11 89 | self.assertTrue(np.allclose( 90 | [0, 1, 3, 4], state_embeddings.array[1, 1, 1])) # Input of e11 91 | # Output of e11 92 | self.assertTrue(np.allclose( 93 | [0, 1, 5, 5], state_embeddings.array[1, 1, 2])) 94 | 95 | def test_Encoder(self): 96 | embed = ExampleEmbed(1, 2, 1, (np.arange(5) + 1).reshape((5, 1))) 97 | 98 | encoder = Encoder(1, initialW=ch.initializers.One(), 99 | initial_bias=ch.initializers.Zero()) 100 | self.assertEqual(6, len(list(encoder.params()))) 101 | """ 102 | state_embeddings: (N, e, 2, 4) -> h1: (N, e, 1) -> h2: (N, e, 2) -> output: (N, e, 2) 103 | """ 104 | 105 | metadata = DatasetMetadata(1, set([]), 2, 2) 106 | e = examples_encoding( 107 | [Example([[0, 1]], 0), Example([[1]], 1)], metadata) 108 | 109 | state_embeddings = embed(np.array([e.types]), np.array([e.values])) 110 | layer_encodings = encoder(state_embeddings) 111 | 112 | self.assertEqual((1, 2, 1), layer_encodings.shape) 113 | for i in range(1): 114 | for j in range(2): 115 | h = np.array(state_embeddings[i, j, :, :].array.sum()) 116 | h = F.sigmoid(F.sigmoid(F.sigmoid(h))) 117 | self.assertEqual(h.array, layer_encodings.array[i, j]) 118 | 119 | def test_Decoder(self): 120 | initialW = np.ones((1, 2)) 121 | initial_bias = np.zeros((1,)) 122 | 123 | decoder = Decoder(1, ch.initializers.One(), ch.initializers.Zero()) 124 | self.assertEqual(2, len(list(decoder.params()))) 125 | 126 | input = np.zeros((1, 2, 2), dtype=np.float32) 127 | input[0, 1, :] = 1.0 128 | output = decoder(input) 129 | """ 130 | [[0, 0], [1, 1]] =(pool)> [[0.5, 0.5]] =(linear)> [1] -> sigmoid 131 | """ 132 | 133 | self.assertEqual((1, 1), output.shape) 134 | self.assertTrue(np.allclose(np.array([1.0]), output.array)) 135 | 136 | def test_TrainingClassifier(self): 137 | embed = ExampleEmbed(1, 2, 2) 138 | encoder = Encoder(10) 139 | decoder = Decoder(2) 140 | classifier = TrainingClassifier(ch.Sequential(embed, encoder, decoder)) 141 | 142 | metadata = DatasetMetadata(1, set([]), 2, 2) 143 | e = examples_encoding( 144 | [Example([[0, 1]], 0), Example([[1]], 1)], metadata) 145 | labels = np.array([[1, 1]]) 146 | loss = classifier(np.array([e.types]), np.array([e.values]), labels) 147 | loss.grad = np.ones(loss.shape, dtype=np.float32) 148 | 149 | # backward does not throw an error 150 | loss.backward() 151 | 152 | def test_tupled_binary_accuracy(self): 153 | acc = tupled_binary_accuracy( 154 | np.array([-1.0, -1.0, -1.0, 1.0]), np.array([0, 0, 1, 1])) 155 | self.assertAlmostEqual(1.0, acc[0].array) 156 | self.assertAlmostEqual(0.5, acc[1].array) 157 | 158 | 159 | if __name__ == "__main__": 160 | unittest.main() 161 | -------------------------------------------------------------------------------- /src/inference.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import tempfile 3 | import os 4 | import numpy as np 5 | import subprocess 6 | import chainer as ch 7 | import chainer.functions as F 8 | from typing import List, Union, Dict, Callable, Set 9 | from .dataset import Example, prior_distribution, examples_encoding 10 | from 
.model import Predictor, ModelShapeParameters 11 | 12 | 13 | class InferenceModel: 14 | """ 15 | The model for inference 16 | 17 | Attributes 18 | ---------- 19 | predictor : ch.Link 20 | The attribute predictor of DeepCoder 21 | model : ch.Link 22 | The model that outputs probabilities of each symbols 23 | """ 24 | 25 | def __init__(self, params: ModelShapeParameters): 26 | """ 27 | Constructor 28 | 29 | Parameters 30 | ---------- 31 | params : ModelShapeParameters 32 | """ 33 | self.predictor = Predictor(params) 34 | self.model = ch.Sequential(self.predictor, F.sigmoid) 35 | 36 | 37 | @dataclasses.dataclass 38 | class SearchResult: 39 | is_solved: bool 40 | probabilities: Dict[str, float] 41 | solution: str 42 | explored_nodes: int 43 | time_seconds: float 44 | 45 | 46 | def search(search: str, timeout_second: int, value_range: int, 47 | examples: List[Example], max_program_length: int, pred: Callable[[List[Example]], Dict[str, float]]) -> SearchResult: 48 | """ 49 | Search over program space and return the result of the search process 50 | 51 | Parameters 52 | ---------- 53 | search : str 54 | The abosolute path of `search` command. 55 | timeout_second : int 56 | The timeout second 57 | value_range : int 58 | The largest absolute value used in the dataset. 59 | examples : List[Example] 60 | The I/O examples used in the search process. 61 | This function find the program that matches the I/O examples. 62 | max_program_length : int 63 | The maximum length of the program 64 | pred : Function from List[Example] to Dict[str, float] 65 | The predict function. It receives the examples as inputs, and returns 66 | the probabilities of functions and lambdas. 67 | 68 | Returns 69 | ------- 70 | SearchResult 71 | The result of the search procedure 72 | """ 73 | # Use temporary directory to conduct search 74 | with tempfile.TemporaryDirectory() as tmpdir: 75 | name = os.path.join(tmpdir, "data", "search") 76 | os.makedirs(name) 77 | 78 | # Dump {input|output}_{types|values}.txt 79 | with open(os.path.join(name, "input_types.txt"), "w") as f: 80 | intypes = [] 81 | for input in examples[0].inputs: 82 | inarr = np.array(input) 83 | if inarr.shape == (): 84 | # Int 85 | intypes.append("Int") 86 | else: 87 | # IntList 88 | intypes.append("Array") 89 | f.write(" ".join(intypes)) 90 | with open(os.path.join(name, "input_values.txt"), "w") as f: 91 | values = [] 92 | for example in examples: 93 | value = [] 94 | for input in example.inputs: 95 | inarr = np.array(input) 96 | if inarr.shape == (): 97 | value.append(str(input)) 98 | else: 99 | value.append(" ".join(list(map(str, inarr)))) 100 | values.append(" | ".join(value)) 101 | f.write("\n".join(values)) 102 | with open(os.path.join(name, "output_types.txt"), "w") as f: 103 | outtypes = [] 104 | output = examples[0].output 105 | outarr = np.array(output) 106 | if outarr.shape == (): 107 | # Int 108 | outtypes.append("Int") 109 | else: 110 | # IntList 111 | outtypes.append("Array") 112 | f.write(" ".join(outtypes)) 113 | with open(os.path.join(name, "output_values.txt"), "w") as f: 114 | values = [] 115 | for example in examples: 116 | value = [] 117 | outarr = np.array(example.output) 118 | if outarr.shape == (): 119 | value.append(str(example.output)) 120 | else: 121 | value.append(" ".join(list(map(str, outarr)))) 122 | values.append(" | ".join(value)) 123 | f.write("\n".join(values)) 124 | 125 | # Get probabilities 126 | try: 127 | prob = pred(examples) 128 | except: 129 | return SearchResult(False, dict([]), "", -1, -1) 130 | 131 | # Dump the 
probabilities to the file 132 | with open(os.path.join(name, "prior.txt"), "w") as f: 133 | probs = ["{} {}".format(p, name) for name, p in prob.items()] 134 | f.write("\n".join(probs)) 135 | 136 | # Execute search command 137 | try: 138 | res = subprocess.run( 139 | [search, "search", str(len(examples)), str( 140 | max_program_length), "0", "0", "-1", str(value_range)], 141 | stdout=subprocess.PIPE, 142 | timeout=timeout_second, cwd=tmpdir) 143 | lines = res.stdout.decode().split("\n") 144 | except subprocess.TimeoutExpired: 145 | return SearchResult(False, prob, "", -1, timeout_second) 146 | 147 | for i, line in enumerate(lines): 148 | if line == "Solved!": 149 | solution = "\n".join(lines[i + 4:]) 150 | explored_nodes = int( 151 | lines[i + 1].replace("Nodes explored: ", "")) 152 | time = float(lines[i + 2]) 153 | return SearchResult( 154 | True, prob, solution, explored_nodes, time) 155 | 156 | return SearchResult(False, prob, "", -1, -1) 157 | 158 | 159 | def predict_with_prior_distribution(dataset): 160 | """ 161 | Predict by using the prior distribution of the dataset 162 | 163 | Parameters 164 | ---------- 165 | dataset : chainer.dataset 166 | The training dataset 167 | 168 | Returns 169 | ------- 170 | function 171 | The predict function 172 | """ 173 | prior = prior_distribution(dataset) 174 | return lambda x: prior 175 | 176 | 177 | def predict_with_neural_network(model_shape: ModelShapeParameters, model: InferenceModel): 178 | """ 179 | Predict by using the neural network 180 | 181 | Parameters 182 | ---------- 183 | model_shape : ModelShapeParameters 184 | The parameters of the neural network model. 185 | It is used to interpret the output of the neural network. 186 | model : InferenceModel 187 | The deep neural network model. 188 | 189 | Returns 190 | ------- 191 | function 192 | The predict function 193 | """ 194 | def pred(examples: List[Example]): 195 | encodings = examples_encoding(examples, model_shape.dataset_metadata) 196 | pred = model.model(np.array([encodings.types]), np.array( 197 | [encodings.values])).array[0] 198 | retval = dict() 199 | for name, p in zip(sorted(list(model_shape.dataset_metadata.symbols)), pred): 200 | retval[name] = p 201 | return retval 202 | return pred 203 | -------------------------------------------------------------------------------- /inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder inference", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "yje9hqtcUQ_f", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "### Initialization\n", 27 | "* Check whether the runtime is host or local.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " runtime = \"host\"\n", 41 | "except:\n", 42 | " runtime = \"local\"" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "_S457sT6QMUr", 51 | "colab_type": "text" 52 | }, 53 | "source": [ 54 | "### Parameters" 55 | ] 56 | }, 57 | { 58 | 
"cell_type": "code", 59 | "metadata": { 60 | "colab_type": "code", 61 | "id": "QN-4eF51DNqt", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "#@title Parameters\n", 66 | "#@markdown |Name |Description|\n", 67 | "#@markdown |:--- |:---|\n", 68 | "#@markdown |`seed`|The random seed|\n", 69 | "seed = 3984 #@param {type: \"number\"}\n", 70 | "\n", 71 | "#@markdown ### `deep-coder` Repositories\n", 72 | "#@markdown |Name |Description|\n", 73 | "#@markdown |:--- |:---|\n", 74 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 75 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 76 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 77 | "branch_name = \"master\" #@param {type: \"string\"}\n", 78 | "\n", 79 | "#@markdown ### Settings\n", 80 | "#@markdown |Name |Description|\n", 81 | "#@markdown |:--- |:---|\n", 82 | "#@markdown |`device`|The id of GPU. `-1` means that CPU is used.|\n", 83 | "device = 0 #@param {type: \"number\"}\n", 84 | "\n", 85 | "#@markdown ### URLs\n", 86 | "#@markdown |Name |Description|\n", 87 | "#@markdown |:--- |:---|\n", 88 | "#@markdown |`model_shape_path`|The file path of the model shape.|\n", 89 | "#@markdown |`model_path` |The file of the model parameters.|\n", 90 | "model_shape_path = \"./examples/medium/trained-model/model-shape.pickle\" #@param {type: \"string\"}\n", 91 | "model_path = \"./examples/medium/trained-model/model.npz\" #@param {type: \"string\"}\n" 92 | ], 93 | "execution_count": 0, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "_BembldCdOO1", 100 | "colab_type": "text" 101 | }, 102 | "source": [ 103 | "### Setup\n", 104 | "* Fix the random seed\n", 105 | "* Download the codebase\n", 106 | " 1. Clone git repository and move to the specified branch\n", 107 | " 2. Initialize submodule\n", 108 | " 3. Install chainer and cupy" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "metadata": { 114 | "id": "GwjlAkY1fR5j", 115 | "colab_type": "code", 116 | "colab": {} 117 | }, 118 | "source": [ 119 | "import numpy as np\n", 120 | "import random\n", 121 | "\n", 122 | "SEED_MAX = 2**32 - 1\n", 123 | "\n", 124 | "root_rng = np.random.RandomState(seed)\n", 125 | "random.seed(root_rng.randint(SEED_MAX))\n", 126 | "np.random.seed(root_rng.randint(SEED_MAX))" 127 | ], 128 | "execution_count": 0, 129 | "outputs": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "metadata": { 134 | "id": "FIZJmuz8QFn_", 135 | "colab_type": "code", 136 | "colab": {} 137 | }, 138 | "source": [ 139 | "if runtime == \"host\":\n", 140 | " %cd /content\n", 141 | " !rm -rf deep-coder\n", 142 | " ![ ! 
-e deep-coder ] && git clone $repository_url deep-coder\n", 143 | " %cd deep-coder\n", 144 | " !git checkout origin/$branch_name\n", 145 | " !git submodule init\n", 146 | " !git submodule update\n", 147 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc` CFLAGS=\"-DVERBOSE_MODE\"\n", 148 | " !curl https://colab.chainer.org/install | sh -\n", 149 | " !pip install tqdm" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "id": "Oz7sdzxUi70b", 158 | "colab_type": "text" 159 | }, 160 | "source": [ 161 | "### Examples" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "h7kdglcUjDTQ", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "inputs = [[1, [-1, 3, 2, 4, -5]], [0, [-2, 2, 3, 3]], [2, [1, 2, 3]], [1, [0, 1, 2]]] #@param {type: \"raw\"}\n", 173 | "outputs = [-1, -2, 3, 1] #@param {type: \"raw\"}" 174 | ], 175 | "execution_count": 0, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "colab_type": "text", 182 | "id": "4IOCX_PXG6sH" 183 | }, 184 | "source": [ 185 | "### Run Inference" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "colab_type": "code", 192 | "id": "SsVdGBe4G6sJ", 193 | "colab": {} 194 | }, 195 | "source": [ 196 | "import pickle\n", 197 | "import os\n", 198 | "import chainer as ch\n", 199 | "from chainer import datasets\n", 200 | "from src.dataset import EncodedDataset, Dataset\n", 201 | "import src.inference as I\n", 202 | "from src.model import ModelShapeParameters\n", 203 | "from src.dataset import Example\n", 204 | "from tqdm import tqdm_notebook as tqdm\n", 205 | "\n", 206 | "with open(model_shape_path, \"rb\") as f:\n", 207 | " model_shape = pickle.load(f)\n", 208 | "\n", 209 | " model = I.InferenceModel(model_shape)\n", 210 | "ch.serializers.load_npz(model_path, model.predictor)\n", 211 | "\n", 212 | "pred = I.predict_with_neural_network(model_shape, model)\n", 213 | "\n", 214 | "\n", 215 | "examples = [Example(inputs, output) for inputs, output in zip(inputs, outputs)]\n", 216 | "print(examples)\n", 217 | "result = I.search(\n", 218 | " os.path.join(os.getcwd(), \"DeepCoder_Utils\",\n", 219 | " \"enumerative-search\", \"search\"),\n", 220 | " 100,\n", 221 | " model_shape.dataset_metadata.value_range,\n", 222 | " examples,\n", 223 | " 2,\n", 224 | " pred)\n", 225 | "\n", 226 | "if result.is_solved:\n", 227 | " print(\"Time: {} sec\".format(result.time_seconds))\n", 228 | " print(\"#ExploredNodes: {}\".format(result.explored_nodes))\n", 229 | " print(result.solution)\n", 230 | "else:\n", 231 | " print(\"Failed to synthesize\")\n" 232 | ], 233 | "execution_count": 0, 234 | "outputs": [] 235 | } 236 | ] 237 | } -------------------------------------------------------------------------------- /test/test_generate_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import pickle 4 | import os 5 | import numpy as np 6 | from src.deepcoder_utils import generate_io_samples 7 | from src.dsl import Function, Type, Variable, Expression, Program 8 | from src.dataset import DatasetMetadata 9 | from src.generate_dataset import generate_dataset, DatasetSpec, EquivalenceCheckingSpec, IteratorDecorator 10 | from src.program_simplifier import remove_redundant_variables 11 | 12 | 13 | class Test_generate_dataset(unittest.TestCase): 14 | def test_generate_dataset(self): 15 | LINQ, _ = 
generate_io_samples.get_language(50) 16 | HEAD = [f for f in LINQ if f.src == "HEAD"][0] 17 | TAKE = [f for f in LINQ if f.src == "TAKE"][0] 18 | 19 | # Generate the program with the length of 1 20 | with tempfile.NamedTemporaryFile() as f: 21 | name = f.name 22 | generate_dataset([HEAD, TAKE], DatasetSpec( 23 | 50, 20, 5, 1, 1), EquivalenceCheckingSpec(1.0, 1, None), name) 24 | # Check the dataset 25 | srcs = set() 26 | with open(name, "rb") as fp: 27 | d = pickle.load(fp) 28 | dataset = d.dataset 29 | metadata = d.metadata 30 | for entry, in dataset: 31 | srcs.add(entry.source_code) 32 | p = generate_io_samples.compile( 33 | entry.source_code, 50, 5) 34 | self.assertNotEqual(None, p) 35 | for example in entry.examples: 36 | output = p.fun(example.inputs) 37 | self.assertEqual(output, example.output) 38 | self.assertEqual( 39 | set(["a <- int\nb <- [int]\nc <- TAKE a b", "a <- [int]\nb <- HEAD a"]), srcs) 40 | self.assertEqual(DatasetMetadata( 41 | 2, set(["TAKE", "HEAD"]), 50, 20), metadata) 42 | 43 | # Generate the program with the length of 2 44 | with tempfile.NamedTemporaryFile() as f: 45 | name = f.name 46 | 47 | def simplify(program): 48 | program = remove_redundant_variables(program) 49 | return program 50 | generate_dataset([HEAD, TAKE], DatasetSpec( 51 | 50, 20, 5, 2, 2), EquivalenceCheckingSpec(1.0, 1, None), name, simplify=simplify) 52 | 53 | # Check the dataset 54 | srcs = set() 55 | with open(name, "rb") as fp: 56 | d = pickle.load(fp) 57 | dataset = d.dataset 58 | metadata = d.metadata 59 | for entry, in dataset: 60 | srcs.add(entry.source_code) 61 | p = generate_io_samples.compile( 62 | entry.source_code, 50, 5) 63 | self.assertNotEqual(None, p) 64 | for example in entry.examples: 65 | output = p.fun(example.inputs) 66 | self.assertEqual(output, example.output) 67 | self.assertEqual(set([ 68 | "a <- [int]\nb <- HEAD a\nc <- TAKE b a", 69 | "a <- int\nb <- [int]\nc <- TAKE a b\nd <- TAKE a c", 70 | "a <- int\nb <- [int]\nc <- int\nd <- TAKE a b\ne <- TAKE c d", 71 | "a <- int\nb <- [int]\nc <- TAKE a b\nd <- HEAD c", 72 | "a <- [int]\nb <- [int]\nc <- HEAD a\nd <- TAKE c b" 73 | ]), srcs) 74 | self.assertEqual(DatasetMetadata( 75 | 3, set(["TAKE", "HEAD"]), 50, 20), metadata) 76 | 77 | def test_generate_dataset_can_relax_equivalence_checking(self): 78 | LINQ, _ = generate_io_samples.get_language(50) 79 | HEAD = [f for f in LINQ if f.src == "HEAD"][0] 80 | LAST = [f for f in LINQ if f.src == "LAST"][0] 81 | 82 | # Generate the program with the length of 1 83 | with tempfile.NamedTemporaryFile() as f: 84 | name = f.name 85 | np.random.seed(0) 86 | generate_dataset([HEAD, LAST], DatasetSpec( 87 | 50, 20, 5, 1, 1), EquivalenceCheckingSpec(0, 1, None), name) 88 | # Check the dataset 89 | srcs = set() 90 | with open(name, "rb") as fp: 91 | d = pickle.load(fp) 92 | dataset = d.dataset 93 | metadata = d.metadata 94 | for entry, in dataset: 95 | srcs.add(entry.source_code) 96 | p = generate_io_samples.compile( 97 | entry.source_code, 50, 5) 98 | self.assertNotEqual(None, p) 99 | for example in entry.examples: 100 | output = p.fun(example.inputs) 101 | self.assertEqual(output, example.output) 102 | self.assertEqual( 103 | set(["a <- [int]\nb <- HEAD a", "a <- [int]\nb <- LAST a"]), srcs) 104 | self.assertEqual(DatasetMetadata( 105 | 1, set(["HEAD", "LAST"]), 50, 20), metadata) 106 | 107 | def test_generate_dataset_with_decorators(self): 108 | LINQ, _ = generate_io_samples.get_language(50) 109 | HEAD = [f for f in LINQ if f.src == "HEAD"][0] 110 | LAST = [f for f in LINQ if f.src == 
"LAST"][0] 111 | 112 | class Decorator: 113 | def __init__(self): 114 | self.items = [] 115 | def __call__(self, generator): 116 | for item in generator: 117 | self.items.append(item) 118 | yield item 119 | program = Decorator() 120 | entries = Decorator() 121 | decorator = IteratorDecorator(program, entries) 122 | 123 | # Generate the program with the length of 1 124 | with tempfile.NamedTemporaryFile() as f: 125 | name = f.name 126 | np.random.seed(0) 127 | generate_dataset([HEAD, LAST], DatasetSpec( 128 | 50, 20, 5, 1, 1), EquivalenceCheckingSpec(1, 1, None), name, decorator=decorator) 129 | self.assertEqual(2, len(program.items)) 130 | self.assertEqual(1, len(entries.items)) 131 | 132 | def test_generate_dataset_separate_higher_order_function_and_lambda(self): 133 | LINQ, _ = generate_io_samples.get_language(50) 134 | HEAD = [f for f in LINQ if f.src == "HEAD"][0] 135 | MAP_INC = [f for f in LINQ if f.src == "MAP INC"][0] 136 | 137 | # Generate the program with the length of 1 138 | with tempfile.NamedTemporaryFile() as f: 139 | name = f.name 140 | np.random.seed(0) 141 | generate_dataset([HEAD, MAP_INC], DatasetSpec( 142 | 50, 20, 5, 1, 1), EquivalenceCheckingSpec(1, 1, None), name) 143 | # Check the dataset 144 | attribute_keys = set() 145 | with open(name, "rb") as fp: 146 | d = pickle.load(fp) 147 | dataset = d.dataset 148 | metadata = d.metadata 149 | for entry, in dataset: 150 | for symbol in entry.attribute.keys(): 151 | attribute_keys.add(symbol) 152 | self.assertEqual(set(["HEAD", "MAP", "INC"]), attribute_keys) 153 | self.assertEqual(DatasetMetadata( 154 | 1, set(["HEAD", "MAP", "INC"]), 50, 20), metadata) 155 | 156 | def test_generate_dataset_fix_number_of_dataset(self): 157 | LINQ, _ = generate_io_samples.get_language(50) 158 | HEAD = [f for f in LINQ if f.src == "HEAD"][0] 159 | LAST = [f for f in LINQ if f.src == "LAST"][0] 160 | MAXIMUM = [f for f in LINQ if f.src == "MAXIMUM"][0] 161 | 162 | # Generate the program with the length of 1 163 | with tempfile.NamedTemporaryFile() as f: 164 | name = f.name 165 | np.random.seed(0) 166 | generate_dataset([HEAD, LAST, MAXIMUM], DatasetSpec( 167 | 50, 20, 5, 1, 1), EquivalenceCheckingSpec(1, 1, None), name, 2) 168 | # Check the dataset 169 | attribute_keys = set() 170 | with open(name, "rb") as fp: 171 | d = pickle.load(fp) 172 | dataset = d.dataset 173 | metadata = d.metadata 174 | self.assertEqual(2, len(dataset)) 175 | self.assertTrue(dataset[0][0].source_code != dataset[1][0].source_code) 176 | 177 | if __name__ == "__main__": 178 | unittest.main() 179 | -------------------------------------------------------------------------------- /src/dataset.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import numpy as np 3 | import chainer as ch 4 | from chainer import datasets 5 | from typing import List, Union, Dict, Set 6 | from .dsl import Function 7 | 8 | Primitive = Union[int, List[int]] 9 | 10 | 11 | @dataclasses.dataclass 12 | class Example: 13 | """ 14 | An I/O example 15 | 16 | Attributes 17 | ---------- 18 | inputs : List[Primitive] 19 | output : Primitive 20 | """ 21 | inputs: List[Primitive] 22 | output: Primitive 23 | 24 | 25 | @dataclasses.dataclass 26 | class Entry: 27 | """ 28 | The entry of this dataset 29 | 30 | Attributes 31 | ---------- 32 | source_code : str 33 | The source code of the program 34 | examples : list of Example 35 | The input/output examples for the source code 36 | attribute : dict from str to bool 37 | The binary attribute of the 
source code. 38 | The key represents the name of functions or lambdas (symbols). 39 | The value represents whether the program contains 40 | the function or not. 41 | """ 42 | source_code: str 43 | examples: List[Example] 44 | attribute: Dict[str, bool] 45 | 46 | 47 | @dataclasses.dataclass 48 | class DatasetMetadata: 49 | max_num_inputs: int 50 | symbols: Set[str] 51 | value_range: int 52 | max_list_length: int 53 | 54 | 55 | @dataclasses.dataclass 56 | class Dataset: 57 | dataset: ch.datasets.TupleDataset 58 | metadata: DatasetMetadata 59 | 60 | 61 | def dataset_metadata(dataset, value_range: int = -1, max_list_length: int = -1) -> DatasetMetadata: # TODO 62 | """ 63 | Return the values for specifying the model shape 64 | 65 | Parameters 66 | ---------- 67 | dataset : chainer.dataset 68 | value_range : int 69 | The largest absolute value used in the dataset 70 | max_list_length: int 71 | 72 | Returns 73 | ------- 74 | DatasetMetadata 75 | """ 76 | num_inputs = 0 77 | symbols = set([]) 78 | for entry in dataset: 79 | entry = entry[0] 80 | num_inputs = max(num_inputs, len(entry.examples[0].inputs)) 81 | if len(symbols) == 0: 82 | for symbol in entry.attribute.keys(): 83 | symbols.add(symbol) 84 | return DatasetMetadata(num_inputs, symbols, value_range, max_list_length) 85 | 86 | 87 | def prior_distribution(dataset) -> Dict[str, float]: 88 | """ 89 | Return the prior distribution over functions 90 | 91 | Parameters 92 | ---------- 93 | dataset : chainer.dataset 94 | The dataset to calculate the prior distribution. 95 | Each element of the dataset should be Tuple[Entry]. 96 | 97 | Returns 98 | ------- 99 | prior : Dict[str, float] 100 | The value represents the frequency of the function or lambda in the dataset. 101 | """ 102 | 103 | prior: Dict[Function, float] = dict() 104 | for entry in dataset: 105 | entry = entry[0] 106 | for symbol, value in entry.attribute.items(): 107 | if not symbol in prior: 108 | prior[symbol] = 0 109 | prior[symbol] += 1 if value else 0 110 | 111 | for symbol in prior.keys(): 112 | prior[symbol] /= len(dataset) 113 | 114 | return prior 115 | 116 | 117 | @dataclasses.dataclass 118 | class PrimitiveEncoding: 119 | """ 120 | A encoding of Primitive 121 | 122 | Attributes 123 | ---------- 124 | t : int 125 | It represents the type of the primitive. 126 | 0 means that the type is Int, and 1 means that the type is List[Int] 127 | value_arr : np.array 128 | The array of the values. 129 | The empty elements are filled with Null value. 130 | """ 131 | t: int 132 | value_arr: np.array 133 | 134 | 135 | @dataclasses.dataclass 136 | class ExamplesEncoding: 137 | """ 138 | A encoding of the list of Example 139 | 140 | Attributes 141 | ---------- 142 | types : np.array 143 | The encoding of inputs and output types. 144 | The shape is (E, I + 1, 2) where 145 | E is the number of examples and 146 | I is the maximum number of inputs 147 | values: np.array 148 | The encoding of inputs and output values. 149 | The shape is (E, I + 1, max_list_length) where 150 | E is the number of examples, 151 | I is the maximum number of inputs, and 152 | max_list_length is the maximum length of the list. 
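For illustration, a minimal sketch of the resulting shapes, assuming num_examples=5, max_num_inputs=3, and max_list_length=20 (the values used in the example notebooks), built with the examples_encoding function defined later in this module:
```
# `entry` / `metadata` are an Entry and its DatasetMetadata (assumed bindings)
enc = examples_encoding(entry.examples, metadata)
enc.types.shape   # (5, 4, 2)  -- one row per input plus one row for the output
enc.values.shape  # (5, 4, 20) -- integer codes, padded with the Null code (2 * value_range)
```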
153 | """ 154 | types: np.array 155 | values: np.array 156 | 157 | 158 | @dataclasses.dataclass 159 | class EntryEncoding: 160 | """ 161 | A encoding of Entry 162 | 163 | Attributes 164 | ---------- 165 | examples: ExamplesEncoding 166 | attribute: np.array 167 | """ 168 | examples: ExamplesEncoding 169 | attribute: np.array 170 | 171 | 172 | def primitive_encoding(p: Primitive, metadata: DatasetMetadata) -> PrimitiveEncoding: 173 | """ 174 | Parameters 175 | ---------- 176 | p : Primitive 177 | The primitive to encode 178 | metadata : DatasetMetadata 179 | 180 | Returns 181 | ------- 182 | PrimitiveEncoding 183 | The encoding of the primitive 184 | """ 185 | Null = metadata.value_range 186 | arr = np.array(p) 187 | 188 | t = 0 if arr.shape == () else 1 189 | value_arr = np.ones((metadata.max_list_length,)) * Null 190 | value_arr[:arr.size] = arr 191 | 192 | # Add offset of value_range because the range of integers is [-value_range:value_range-1] 193 | return PrimitiveEncoding(t, value_arr + metadata.value_range) 194 | 195 | 196 | def examples_encoding(examples: List[Example], metadata: DatasetMetadata) -> ExamplesEncoding: 197 | E = len(examples) 198 | I = metadata.max_num_inputs 199 | max_list_length = metadata.max_list_length 200 | Null = metadata.value_range * 2 201 | 202 | types = np.zeros((E, I + 1, 2), dtype=np.float32) 203 | values = np.ones((E, I + 1, max_list_length), dtype=np.int32) * Null 204 | for i, example in enumerate(examples): 205 | if len(example.inputs) > I: 206 | raise RuntimeError("The number of inputs ({}) exceeds the limits ({})".format( 207 | len(example.inputs), I)) 208 | enc_inputs = [primitive_encoding(ins, metadata) 209 | for ins in example.inputs] 210 | enc_output = primitive_encoding(example.output, metadata) 211 | types[i, :len(example.inputs), :] = [np.identity(2)[enc_input.t] 212 | for enc_input in enc_inputs] 213 | values[i, :len(example.inputs), :] = [ 214 | enc_input.value_arr for enc_input in enc_inputs] 215 | types[i, I, :] = np.identity(2)[enc_output.t] 216 | values[i, I, :] = enc_output.value_arr 217 | 218 | return ExamplesEncoding(types, values) 219 | 220 | 221 | def attribute_encoding(attribute: Dict[Function, bool]) -> np.array: 222 | """ 223 | Parameters 224 | ---------- 225 | attribute : Dict[Function, bool] 226 | The binary attribute 227 | 228 | Returns 229 | ------- 230 | PrimitiveEntry 231 | The encoding of the entry 232 | """ 233 | symbols = list(attribute.keys()) 234 | symbols = sorted(symbols) 235 | arr = [] 236 | for symbol in symbols: 237 | arr.append(1 if attribute[symbol] else 0) 238 | 239 | return np.array(arr, dtype=np.int32) 240 | 241 | 242 | def entry_encoding(entry: Entry, metadata: DatasetMetadata) -> EntryEncoding: 243 | examples = examples_encoding(entry.examples, metadata) 244 | attribute = attribute_encoding(entry.attribute) 245 | return EntryEncoding(examples, attribute) 246 | 247 | 248 | class EncodedDataset(datasets.TransformDataset): 249 | """ 250 | The dataset of the entry encodings for DeepCoder 251 | This instance stores each entry as the tuple of 252 | (the encoding of types, the encoding of values, the encoding of attribute). 
253 | """ 254 | 255 | def __init__(self, dataset: Dataset): 256 | """ 257 | Constructor 258 | 259 | Parameters 260 | ---------- 261 | dataset : Dataset 262 | The dataset and its metadata 263 | """ 264 | 265 | def transform(in_data): 266 | entry = in_data[0] 267 | encoding = entry_encoding(entry, dataset.metadata) 268 | return encoding.examples.types, encoding.examples.values, encoding.attribute 269 | 270 | super(EncodedDataset, self).__init__(dataset.dataset, transform) 271 | -------------------------------------------------------------------------------- /validate_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder baseline", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " drive.mount('/gdrive')\n", 41 | " runtime = \"host\"\n", 42 | "except:\n", 43 | " runtime = \"local\"" 44 | ], 45 | "execution_count": 0, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "_S457sT6QMUr", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "colab_type": "code", 62 | "id": "QN-4eF51DNqt", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "#@title Parameters\n", 67 | "#@markdown |Name |Description|\n", 68 | "#@markdown |:--- |:---|\n", 69 | "#@markdown |`seed`|The random seed|\n", 70 | "seed = 3984 #@param {type: \"number\"}\n", 71 | "\n", 72 | "#@markdown ### `deep-coder` Repositories\n", 73 | "#@markdown |Name |Description|\n", 74 | "#@markdown |:--- |:---|\n", 75 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 76 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 77 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 78 | "branch_name = \"master\" #@param {type: \"string\"}\n", 79 | "\n", 80 | "#@markdown ### Validation Settings\n", 81 | "#@markdown |Name |Description|\n", 82 | "#@markdown |:--- |:---|\n", 83 | "#@markdown |`timeout_second` ||\n", 84 | "#@markdown |`max_program_length`|The maximum length of the program|\n", 85 | "timeout_second = 1 #@param {type: \"number\"}\n", 86 | "max_program_length = 2 #@param {type: \"number\"}\n", 87 | "\n", 88 | "#@markdown ### Filepath\n", 89 | "#@markdown |Name |Description|\n", 90 | "#@markdown |:--- |:---|\n", 91 | "#@markdown |`train_dataset_path`|The file path of the training dataset.|\n", 92 | "#@markdown |`valid_dataset_path`|The file path of the validation dataset.|\n", 93 | "#@markdown |`destination_path` |The directory of the directory that will contain the training results.|\n", 94 | "train_dataset_path = \"./dataset/train.pickle\" #@param {type: \"string\"}\n", 95 
| "valid_dataset_path = \"./dataset/valid.pickle\" #@param {type: \"string\"}\n", 96 | "destination_path = \"./out/baseline\" #@param {type: \"string\"}\n", 97 | "\n" 98 | ], 99 | "execution_count": 0, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "id": "_BembldCdOO1", 106 | "colab_type": "text" 107 | }, 108 | "source": [ 109 | "### Setup\n", 110 | "* Fix the random seed\n", 111 | "* Download the codebase\n", 112 | " 1. Clone git repository and move to the specified branch\n", 113 | " 2. Initialize submodule\n", 114 | " 3. Install chainer and cupy\n", 115 | "* Copy the dataset from Google Drive" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "GwjlAkY1fR5j", 122 | "colab_type": "code", 123 | "colab": {} 124 | }, 125 | "source": [ 126 | "import numpy as np\n", 127 | "import random\n", 128 | "\n", 129 | "SEED_MAX = 2**32 - 1\n", 130 | "\n", 131 | "root_rng = np.random.RandomState(seed)\n", 132 | "random.seed(root_rng.randint(SEED_MAX))\n", 133 | "np.random.seed(root_rng.randint(SEED_MAX))" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "FIZJmuz8QFn_", 142 | "colab_type": "code", 143 | "colab": {} 144 | }, 145 | "source": [ 146 | "if runtime == \"host\":\n", 147 | " %cd /content\n", 148 | " !rm -rf deep-coder\n", 149 | " ![ ! -e deep-coder ] && git clone $repository_url deep-coder\n", 150 | " %cd deep-coder\n", 151 | " !git checkout origin/$branch_name\n", 152 | " !git submodule init\n", 153 | " !git submodule update\n", 154 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 155 | " !curl https://colab.chainer.org/install | sh -", 156 | " !pip install tqdm" 157 | ], 158 | "execution_count": 0, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "_JEszRdKFzkb", 165 | "colab_type": "text" 166 | }, 167 | "source": [ 168 | "### Validate Baseline Predictor" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "colab_type": "code", 175 | "id": "fJ0rp0_BGeSb", 176 | "colab": {} 177 | }, 178 | "source": [ 179 | "import pickle\n", 180 | "import os\n", 181 | "import chainer as ch\n", 182 | "from chainer import datasets\n", 183 | "from src.dataset import EncodedDataset, Dataset\n", 184 | "import src.inference as I\n", 185 | "from src.model import ModelShapeParameters\n", 186 | "from tqdm import tqdm_notebook as tqdm\n", 187 | "\n", 188 | "with open(valid_dataset_path, \"rb\") as f:\n", 189 | " dataset: Dataset = pickle.load(f)\n", 190 | "\n", 191 | "with open(train_dataset_path, \"rb\") as f:\n", 192 | " train: ch.datasets.TupleDataset = pickle.load(f).dataset\n", 193 | "pred = I.predict_with_prior_distribution(train)\n", 194 | "\n", 195 | "results = dict([])\n", 196 | "num_succ = 0\n", 197 | "for i, (entry,) in enumerate(tqdm(dataset.dataset)):\n", 198 | " result = I.search(\n", 199 | " os.path.join(os.getcwd(), \"DeepCoder_Utils\",\n", 200 | " \"enumerative-search\", \"search\"),\n", 201 | " timeout_second,\n", 202 | " dataset.metadata.value_range,\n", 203 | " entry.examples,\n", 204 | " max_program_length,\n", 205 | " pred\n", 206 | " )\n", 207 | " results[i] = result\n", 208 | " if result.is_solved:\n", 209 | " num_succ += 1\n", 210 | "\n", 211 | "print(\"Solved: {} of {} examples\".format(num_succ, len(dataset.dataset)))\n" 212 | ], 213 | "execution_count": 0, 214 | "outputs": [] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 
219 | "id": "YWufhkoaw9Bq", 220 | "colab_type": "text" 221 | }, 222 | "source": [ 223 | "### Teardown\n", 224 | "* Save the baseline result" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "metadata": { 230 | "id": "D-WYlqxVkO5i", 231 | "colab_type": "code", 232 | "colab": {} 233 | }, 234 | "source": [ 235 | "import os\n", 236 | "import chainer as ch\n", 237 | "\n", 238 | "if not os.path.exists(destination_path):\n", 239 | " os.makedirs(destination_path)\n", 240 | "\n", 241 | "with open(os.path.join(destination_path, \"result.pickle\"), \"wb\") as f:\n", 242 | " pickle.dump(results, f)\n" 243 | ], 244 | "execution_count": 0, 245 | "outputs": [] 246 | } 247 | ] 248 | } -------------------------------------------------------------------------------- /test/test_program_simplifier.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from src.dsl import Function, Type, Variable, Expression, Program, Signature, Statement 4 | from src.program_simplifier import normalize, remove_redundant_variables, remove_redundant_expressions, remove_dependency_between_variables 5 | 6 | 7 | class Test_program_simplifier(unittest.TestCase): 8 | def test_normalize(self): 9 | F = Function("FUNC", Signature([Type.Int], Type.IntList)) 10 | p = Program([Variable(1, Type.Int), Variable(0, Type.IntList)], []) 11 | p = normalize(p) 12 | self.assertEqual( 13 | p, 14 | Program([Variable(0, Type.IntList), Variable(1, Type.Int)], []) 15 | ) 16 | 17 | p = Program([Variable(2, Type.Int), Variable(0, Type.IntList)], [ 18 | Statement(Variable(3, Type.Int), Expression(F, [Variable(2, Type.Int)]))]) 19 | p = normalize(p) 20 | self.assertEqual( 21 | p, 22 | Program([Variable(0, Type.IntList), Variable(1, Type.Int)], [ 23 | Statement(Variable(2, Type.Int), Expression(F, [Variable(1, Type.Int)]))]) 24 | ) 25 | 26 | def test_remove_redundant_variables(self): 27 | F = Function("FUNC", Signature([Type.Int], Type.IntList)) 28 | p = Program([Variable(0, Type.Int), Variable(1, Type.IntList)], []) 29 | p = remove_redundant_variables(p) 30 | self.assertEqual(p, Program([], [])) 31 | 32 | p = Program([Variable(0, Type.Int), Variable(1, Type.IntList)], [ 33 | Statement(Variable(2, Type.Int), Expression(F, [Variable(0, Type.Int)]))]) 34 | p = remove_redundant_variables(p) 35 | self.assertEqual( 36 | p, 37 | Program([Variable(0, Type.Int)], [ 38 | Statement(Variable(2, Type.Int), Expression(F, [Variable(0, Type.Int)]))]) 39 | ) 40 | 41 | p = Program([Variable(0, Type.Int)], [Statement(Variable(1, Type.Int), Expression(F, [Variable( 42 | 0, Type.Int)])), Statement(Variable(2, Type.Int), Expression(F, [Variable(0, Type.Int)]))]) 43 | p = remove_redundant_variables(p) 44 | self.assertEqual( 45 | p, 46 | Program([Variable(0, Type.Int)], [ 47 | Statement(Variable(2, Type.Int), Expression(F, [Variable(0, Type.Int)]))]) 48 | ) 49 | 50 | def test_remove_redundant_expressions(self): 51 | F = Function("F", Signature([Type.IntList], Type.IntList)) 52 | SORT = Function("SORT", Signature([Type.IntList], Type.IntList)) 53 | REVERSE = Function("REVERSE", Signature([Type.IntList], Type.IntList)) 54 | 55 | p = Program([Variable(0, Type.IntList)], [ 56 | Statement(Variable(1, Type.Int), Expression(F, [Variable(0, Type.IntList)]))]) 57 | p = remove_redundant_expressions(p) 58 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 59 | Statement(Variable(1, Type.Int), Expression(F, [Variable(0, Type.IntList)]))])) 60 | 61 | p = Program([Variable(0, Type.IntList)], [ 62 | 
Statement(Variable(1, Type.IntList), Expression( 63 | SORT, [Variable(0, Type.IntList)])), 64 | Statement(Variable(2, Type.IntList), Expression( 65 | SORT, [Variable(0, Type.IntList)])), 66 | Statement(Variable(3, Type.IntList), Expression( 67 | F, [Variable(2, Type.IntList)])) 68 | ]) 69 | p = remove_redundant_expressions(p) 70 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 71 | Statement(Variable(1, Type.IntList), Expression( 72 | SORT, [Variable(0, Type.IntList)])), 73 | Statement(Variable(3, Type.IntList), Expression( 74 | F, [Variable(1, Type.IntList)])), 75 | ])) 76 | 77 | p = Program([Variable(0, Type.IntList)], [ 78 | Statement(Variable(1, Type.IntList), Expression( 79 | SORT, [Variable(0, Type.IntList)])), 80 | Statement(Variable(2, Type.IntList), Expression( 81 | SORT, [Variable(1, Type.IntList)])), 82 | Statement(Variable(3, Type.IntList), Expression( 83 | F, [Variable(2, Type.IntList)])) 84 | ]) 85 | p = remove_redundant_expressions(p) 86 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 87 | Statement(Variable(1, Type.IntList), Expression( 88 | SORT, [Variable(0, Type.IntList)])), 89 | Statement(Variable(3, Type.IntList), Expression( 90 | F, [Variable(1, Type.IntList)])), 91 | ])) 92 | 93 | p = Program([Variable(0, Type.IntList)], [ 94 | Statement(Variable(1, Type.IntList), Expression( 95 | REVERSE, [Variable(0, Type.IntList)])), 96 | Statement(Variable(2, Type.IntList), Expression( 97 | REVERSE, [Variable(1, Type.IntList)])), 98 | Statement(Variable(3, Type.IntList), Expression( 99 | F, [Variable(2, Type.IntList)])) 100 | ]) 101 | p = remove_redundant_expressions(p) 102 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 103 | Statement(Variable(1, Type.IntList), Expression( 104 | REVERSE, [Variable(0, Type.IntList)])), 105 | Statement(Variable(3, Type.IntList), Expression( 106 | F, [Variable(0, Type.IntList)])), 107 | ])) 108 | 109 | def test_remove_dependency_between_variables(self): 110 | F = Function("F", Signature([Type.IntList], Type.IntList)) 111 | SORT = Function("SORT", Signature([Type.IntList], Type.IntList)) 112 | REVERSE = Function("REVERSE", Signature([Type.IntList], Type.IntList)) 113 | MAXIMUM = Function("MAXIMUM", Signature([Type.IntList], Type.Int)) 114 | MINIMUM = Function("MINIMUM", Signature([Type.IntList], Type.Int)) 115 | SUM = Function("SUM", Signature([Type.IntList], Type.Int)) 116 | HEAD = Function("HEAD", Signature([Type.IntList], Type.Int)) 117 | LAST = Function("LAST", Signature([Type.IntList], Type.Int)) 118 | 119 | p = Program([Variable(0, Type.IntList)], [ 120 | Statement(Variable(1, Type.Int), Expression(F, [Variable(0, Type.IntList)]))]) 121 | p = remove_dependency_between_variables(p, MINIMUM, MAXIMUM) 122 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 123 | Statement(Variable(1, Type.Int), Expression(F, [Variable(0, Type.IntList)]))])) 124 | 125 | p = Program([Variable(0, Type.IntList)], [ 126 | Statement(Variable(1, Type.IntList), Expression( 127 | REVERSE, [Variable(0, Type.IntList)])), 128 | Statement(Variable(2, Type.IntList), Expression( 129 | SUM, [Variable(1, Type.IntList)])) 130 | ]) 131 | p = remove_dependency_between_variables(p, MINIMUM, MAXIMUM) 132 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 133 | Statement(Variable(1, Type.IntList), Expression( 134 | REVERSE, [Variable(0, Type.IntList)])), 135 | Statement(Variable(2, Type.IntList), Expression( 136 | SUM, [Variable(0, Type.IntList)])), 137 | ])) 138 | 139 | p = Program([Variable(0, Type.IntList)], [ 140 | 
Statement(Variable(1, Type.IntList), Expression( 141 | SORT, [Variable(0, Type.IntList)])), 142 | Statement(Variable(2, Type.IntList), Expression( 143 | HEAD, [Variable(1, Type.IntList)])) 144 | ]) 145 | p = remove_dependency_between_variables(p, MINIMUM, MAXIMUM) 146 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 147 | Statement(Variable(1, Type.IntList), Expression( 148 | SORT, [Variable(0, Type.IntList)])), 149 | Statement(Variable(2, Type.IntList), Expression( 150 | MINIMUM, [Variable(0, Type.IntList)])), 151 | ])) 152 | 153 | p = Program([Variable(0, Type.IntList)], [ 154 | Statement(Variable(1, Type.IntList), Expression( 155 | SORT, [Variable(0, Type.IntList)])), 156 | Statement(Variable(2, Type.IntList), Expression( 157 | LAST, [Variable(1, Type.IntList)])) 158 | ]) 159 | p = remove_dependency_between_variables(p, MINIMUM, MAXIMUM) 160 | self.assertEqual(p, Program([Variable(0, Type.IntList)], [ 161 | Statement(Variable(1, Type.IntList), Expression( 162 | SORT, [Variable(0, Type.IntList)])), 163 | Statement(Variable(2, Type.IntList), Expression( 164 | MAXIMUM, [Variable(0, Type.IntList)])), 165 | ])) 166 | 167 | 168 | if __name__ == "__main__": 169 | unittest.main() 170 | -------------------------------------------------------------------------------- /inspect_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder inspect-model", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " drive.mount('/gdrive')\n", 41 | " runtime = \"host\"\n", 42 | "except:\n", 43 | " runtime = \"local\"" 44 | ], 45 | "execution_count": 0, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "_S457sT6QMUr", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "2LYvG4iCQUwh", 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "#@title Parameters\n", 67 | "#@markdown |Name |Description|\n", 68 | "#@markdown |:--- |:---|\n", 69 | "#@markdown |`seed`|The random seed|\n", 70 | "seed = 3984 #@param {type: \"number\"}\n", 71 | "\n", 72 | "#@markdown ### `deep-coder` Repositories\n", 73 | "#@markdown |Name |Description|\n", 74 | "#@markdown |:--- |:---|\n", 75 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 76 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 77 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 78 | "branch_name = \"master\" #@param {type: \"string\"}\n", 79 | "\n", 80 | "#@markdown ### Filepathes\n", 81 | "#@markdown |Name |Description|\n", 82 | "#@markdown |:--- |:---|\n", 83 | "#@markdown |`model_shape_path`|The file path of the model shape.|\n", 84 | "#@markdown |`model_path` |The file path of the model parameters.|\n", 85 | "model_shape_path = \"out/model-shape.pickle\" #@param {type: \"string\"}\n", 86 | "model_path = \"out/model.npz\" #@param {type: \"string\"}\n", 87 | "\n" 88 | ], 89 | "execution_count": 0, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "_BembldCdOO1", 96 | "colab_type": "text" 97 | }, 98 | "source": [ 99 | "### Setup\n", 100 | "* Fix the random seed\n", 101 | "* Download the codebase (when using the host runtime)\n", 102 | " 1. Clone git repository and move to the specified branch\n", 103 | " 2. Initialize submodule\n", 104 | " 3. Build the `search` tool\n", 105 | " 4. Install chainer and cupy" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "GwjlAkY1fR5j", 112 | "colab_type": "code", 113 | "colab": {} 114 | }, 115 | "source": [ 116 | "import numpy as np\n", 117 | "import random\n", 118 | "\n", 119 | "SEED_MAX = 2**32 - 1\n", 120 | "\n", 121 | "root_rng = np.random.RandomState(seed)\n", 122 | "random.seed(root_rng.randint(SEED_MAX))\n", 123 | "np.random.seed(root_rng.randint(SEED_MAX))" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "FIZJmuz8QFn_", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | "if runtime == \"host\":\n", 137 | " %cd /content\n", 138 | " !rm -rf deep-coder\n", 139 | " ![ ! 
-e deep-coder ] && git clone $repository_url deep-coder\n", 140 | " %cd deep-coder\n", 141 | " !git checkout origin/$branch_name\n", 142 | " !git submodule init\n", 143 | " !git submodule update\n", 144 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 145 | " !curl https://colab.chainer.org/install | sh -" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "Oz7sdzxUi70b", 154 | "colab_type": "text" 155 | }, 156 | "source": [ 157 | "### Load Model\n", 158 | "* Load model" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "h7kdglcUjDTQ", 165 | "colab_type": "code", 166 | "colab": {} 167 | }, 168 | "source": [ 169 | "import pickle\n", 170 | "import chainer as ch\n", 171 | "from src.model import ModelShapeParameters, Predictor\n", 172 | "\n", 173 | "# Load model\n", 174 | "with open(model_shape_path, \"rb\") as f:\n", 175 | " model_shape: ModelShapeParameters = pickle.load(f)\n", 176 | "predictor = Predictor(model_shape)\n", 177 | "ch.serializers.load_npz(model_path, predictor)" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "id": "kr3F1N-V5ZCi", 186 | "colab_type": "text" 187 | }, 188 | "source": [ 189 | "### Visualize Model\n", 190 | "* Show integer embeddings" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "metadata": { 196 | "id": "q6Ik8J8MAcbv", 197 | "colab_type": "code", 198 | "colab": {} 199 | }, 200 | "source": [ 201 | "#@markdown ### Visualization Parameters\n", 202 | "#@markdown |Name |Description|\n", 203 | "#@markdown |:--- |:---|\n", 204 | "#@markdown |`width` |The width of the matplotlib plot|\n", 205 | "#@markdown |`height`|The height of the matplotlib plot|\n", 206 | "#@markdown ---\n", 207 | "width = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 208 | "height = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "TdtM61G15xJx", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "import matplotlib.pyplot as plt\n", 222 | "\n", 223 | "plt.figure(figsize=(width, height))\n", 224 | "\n", 225 | "embed = list(predictor.children())[0]._embed_integer\n", 226 | "axis_0, axis_1 = np.random.choice(model_shape.n_embed, 2, replace=False)\n", 227 | "for i in range(-model_shape.dataset_metadata.value_range, model_shape.dataset_metadata.value_range):\n", 228 | " e = embed(np.array([i + model_shape.dataset_metadata.value_range]))\n", 229 | " x = e.array[0, axis_0]\n", 230 | " y = e.array[0, axis_1]\n", 231 | "\n", 232 | " if i == 0:\n", 233 | " color = \"b\"\n", 234 | " elif i > 0:\n", 235 | " color = \"g\"\n", 236 | " else:\n", 237 | " color = \"r\"\n", 238 | "\n", 239 | " if i % 2 == 0:\n", 240 | " shape = \"s\"\n", 241 | " else:\n", 242 | " shape = \"^\"\n", 243 | "\n", 244 | " plt.plot(x, y, \"{}{}\".format(color, shape))\n", 245 | " if abs(i) < 10 or abs(i) > 253:\n", 246 | " plt.annotate(\"{}\".format(i), xy=(x, y))\n", 247 | "\n", 248 | "e = embed(np.array([2 * model_shape.dataset_metadata.value_range]))\n", 249 | "x = e.array[0, axis_0]\n", 250 | "y = e.array[0, axis_1]\n", 251 | "plt.plot(x, y, \"x\")\n", 252 | "plt.annotate(\"Null\", xy=(x, y))\n" 253 | ], 254 | "execution_count": 0, 255 | "outputs": [] 256 | } 257 | ] 258 | } 
-------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import numpy as np 3 | import chainer as ch 4 | from chainer import link 5 | import chainer.links as L 6 | import chainer.functions as F 7 | from chainer import cuda 8 | from chainer import backend 9 | from chainer import reporter 10 | from typing import List, Union, Dict 11 | from src.dataset import DatasetMetadata 12 | 13 | 14 | @dataclasses.dataclass 15 | class ModelShapeParameters: 16 | dataset_metadata: DatasetMetadata 17 | num_hidden_layers: int 18 | n_embed: int 19 | n_units: int 20 | 21 | 22 | def weighted_sigmoid_cross_entropy(y, t, w_0: float = 0.5): 23 | """ 24 | Compute weighted sigmoid cross entropy 25 | 26 | Parameters 27 | ---------- 28 | y 29 | The prediction vector 30 | t 31 | The ground truth label 32 | w_0 : float 33 | The weight for label=0. 34 | If this value is negative, this function computes the original sigmoid cross entropy 35 | 36 | Returns 37 | ------- 38 | ch.Variable 39 | computed cross entropy 40 | """ 41 | xp = backend.get_array_module(y) 42 | if w_0 < 0: 43 | return F.sigmoid_cross_entropy(y, xp.array(t)) 44 | 45 | t_0 = cuda.to_cpu(t.copy()) 46 | t_1 = cuda.to_cpu(t.copy()) 47 | t_0[t_0 == 1] = -1 48 | t_1[t_0 == 0] = -1 49 | t_0 = xp.array(t_0) 50 | t_1 = xp.array(t_1) 51 | 52 | l0 = F.sigmoid_cross_entropy(y, t_0) 53 | l1 = F.sigmoid_cross_entropy(y, t_1) 54 | return w_0 * l0 + (1.0 - w_0) * l1 55 | 56 | 57 | def tupled_binary_accuracy(y, t): 58 | """ 59 | Compte binary classification accuracy 60 | 61 | Attributes 62 | ---------- 63 | y 64 | The output predictions 65 | t 66 | The ground truth label 67 | """ 68 | 69 | xp = backend.get_array_module(y) 70 | t_0 = cuda.to_cpu(t.copy()) 71 | t_1 = cuda.to_cpu(t.copy()) 72 | t_0[t_0 == 1] = -1 73 | t_1[t_0 == 0] = -1 74 | t_0 = xp.array(t_0) 75 | t_1 = xp.array(t_1) 76 | acc_0 = F.binary_accuracy(y, t_0) 77 | acc_1 = F.binary_accuracy(y, t_1) 78 | 79 | return acc_0, acc_1 80 | 81 | 82 | class ExampleEmbed(link.Chain): 83 | """ 84 | The embeded link of DeepCoder 85 | """ 86 | 87 | def __init__(self, num_inputs: int, value_range: int, n_embed: int, 88 | initialW: Union[None, np.array, ch.Initializer] = None): 89 | """ 90 | Constructor 91 | 92 | Parameters 93 | ---------- 94 | num_inputs : int 95 | The largest number of the inputs 96 | value_range : int 97 | The largest absolute value used in the dataset. 98 | n_embed : int 99 | The dimension of integer embedding. 20 was used in the paper. 100 | initialW : ch.Initializer or np.array or None 101 | The initial value of the weights 102 | """ 103 | super(ExampleEmbed, self).__init__() 104 | 105 | with self.init_scope(): 106 | self._embed_integer = L.EmbedID( 107 | 2 * value_range + 1, n_embed, initialW=initialW) 108 | self._value_range = value_range 109 | self._num_inputs = num_inputs 110 | 111 | def forward(self, types: np.array, values: np.array): 112 | """ 113 | Computes the hidden layer encoding 114 | 115 | Parameters 116 | ---------- 117 | types : np.array 118 | Each element contains one-hot vectors of inputs and output types 119 | values : np.array 120 | Each element contains encodings of primitives 121 | 122 | Returns 123 | ------- 124 | chainer.Variable 125 | The hidden layer encoding. 
The shape is (N, e, (num_inputs + 1), 2 + max_list_length * n_embed) 126 | where 127 | N is the minibatch size, 128 | e is the number of examples, 129 | num_inputs is the largest number of the inputs, 130 | max_list_length is the length of value encoding, and 131 | n_embed is the dimension of integer embedding. 132 | """ 133 | 134 | N = types.shape[0] # minibatch size 135 | e = types.shape[1] # num of I/O examples 136 | num_inputs = types.shape[2] - 1 137 | max_list_length = values.shape[-1] 138 | 139 | # Convert the integer into the learned embeddings 140 | # (N, e, (num_inputs + 1), max_list_length, n_embed) 141 | values_embeddings = self._embed_integer(values) 142 | 143 | # Concat types and values 144 | n_embed = values_embeddings.shape[4] 145 | # (N, e, (num_inputs + 1), max_list_length * n_embed) 146 | values_embeddings = F.reshape( 147 | values_embeddings, (N, e, num_inputs + 1, -1)) 148 | # (N, e, (num_inputs + 1), 2 + max_list_length * n_embed) 149 | state_embeddings = F.concat([types, values_embeddings], axis=3) 150 | 151 | return state_embeddings 152 | 153 | 154 | class Encoder(link.Chain): 155 | """ 156 | The encoder neural network of DeepCoder 157 | """ 158 | 159 | def __init__(self, n_units: int, 160 | num_hidden_layers: int = 3, 161 | initialW: Union[None, ch.Initializer] = None, 162 | initial_bias: Union[None, ch.Initializer] = None): 163 | """ 164 | Constructor 165 | 166 | Parameters 167 | ---------- 168 | n_units : int 169 | The number of units in the hidden layers. 256 was used in the paper. 170 | num_hidden_layers : int 171 | The number of hidden layers. 3 was used in the paper. 172 | initialW : ch.Initializer or None 173 | The initial value of the weights 174 | initial_bias : ch.Initializer or None 175 | The initial value of the biases 176 | """ 177 | super(Encoder, self).__init__() 178 | 179 | linears = [] 180 | with self.init_scope(): 181 | layer = ch.Sequential( 182 | L.Linear(n_units, initialW=initialW, 183 | initial_bias=initial_bias), 184 | F.sigmoid 185 | ) 186 | self._hidden = layer.repeat(num_hidden_layers) 187 | 188 | def forward(self, state_embeddings: np.array): 189 | """ 190 | Computes the hidden layer encoding 191 | 192 | Parameters 193 | ---------- 194 | state_embeddings : np.array 195 | The state embeddings of the examples. 196 | The shape is (N, e, (num_inputs + 1), 2 + max_list_length * n_embed). 197 | 198 | Returns 199 | ------- 200 | chainer.Variable 201 | The hidden layer encoding. The shape is (N, e, n_units) 202 | where 203 | N is the minibatch size, 204 | e is the number of examples, and 205 | n_unit is the number of units in the hidden layers. 206 | """ 207 | 208 | N = state_embeddings.shape[0] # minibatch size 209 | e = state_embeddings.shape[1] 210 | 211 | # Compute the hidden layer encoding 212 | # (N * e, (num_inputs + 1) * (2 + max_list_length * n_embed)) 213 | state_embeddings = F.reshape(state_embeddings, (N * e, -1)) 214 | output = self._hidden(state_embeddings) # (N * e, n_units) 215 | output = F.reshape(output, (N, e, -1)) 216 | return output 217 | 218 | 219 | def Decoder(n_functions: int, initialW: Union[None, ch.Initializer, np.array] = None, 220 | initial_bias: Union[None, ch.Initializer, np.array] = None): 221 | """ 222 | Returns the decoder of DeepCoder 223 | 224 | Parameters 225 | ---------- 226 | n_functions : int 227 | The number of functions 228 | 229 | Returns 230 | ------- 231 | chainer.Link 232 | The decoder of DeepCoder. 
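For reference, a minimal shape sketch (the concrete sizes below are illustrative, not fixed by the model):
```
import numpy as np
dec = Decoder(n_functions=34)                     # illustrative symbol count
y = dec(np.zeros((8, 5, 256), dtype=np.float32))  # (N=8, e=5, n_units=256)
assert y.shape == (8, 34)                         # one pre-sigmoid score per symbol
```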
233 | """ 234 | return ch.Sequential( 235 | # Input: (N, e, n_units) 236 | lambda x: F.mean(x, axis=1), 237 | # Pooled: (N, n_units) 238 | L.Linear(n_functions, initialW=initialW, initial_bias=initial_bias), 239 | # (N, n_functions) 240 | ) 241 | 242 | 243 | def Predictor(params: ModelShapeParameters) -> ch.Link: 244 | """ 245 | Return the model of DeepCoder 246 | 247 | Parameters 248 | ---------- 249 | params : ModelShapeParameters 250 | 251 | Returns 252 | ------- 253 | ch.Link 254 | The model of DeepCoder 255 | """ 256 | embed = ExampleEmbed(params.dataset_metadata.max_num_inputs, 257 | params.dataset_metadata.value_range, params.n_embed) 258 | encoder = Encoder( 259 | params.n_units, num_hidden_layers=params.num_hidden_layers) 260 | decoder = Decoder(len(params.dataset_metadata.symbols)) 261 | 262 | return ch.Sequential(embed, encoder, decoder) 263 | 264 | 265 | def TrainingClassifier(predictor: ch.Link, w_0: float = -1): 266 | """ 267 | Return the classifier for training DeepCoder 268 | 269 | Parameters 270 | ---------- 271 | predictor : ch.Link 272 | w_0 : float 273 | The weight for label=False 274 | 275 | Returns 276 | ------- 277 | chainer.Link 278 | The classifier used for training 279 | """ 280 | 281 | classifier = L.Classifier( 282 | predictor, 283 | lossfun=lambda y, t: weighted_sigmoid_cross_entropy(y, t, w_0), 284 | accfun=F.binary_accuracy 285 | ) 286 | 287 | def accuracy(y, t): 288 | xp = backend.get_array_module(y) 289 | acc_0, acc_1 = tupled_binary_accuracy(y, t) 290 | reporter.report( 291 | {"accuracy_false": acc_0, "accuracy_true": acc_1}, classifier) 292 | return F.binary_accuracy(y, xp.array(t)) 293 | classifier.accfun = accuracy 294 | return classifier 295 | -------------------------------------------------------------------------------- /src/program_simplifier.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from .dsl import Function, Type, Variable, Expression, Program, Statement 3 | 4 | 5 | def normalize(program: Program): 6 | """ 7 | Return the normalized program 8 | This function applies 2 transformations: 9 | 1) sort input variables by order of use 10 | 2) make the variable ids consecutive numbers 11 | 12 | Parameters 13 | ---------- 14 | program : Program 15 | The program that will be normalized 16 | 17 | Returns 18 | ------- 19 | program : Program 20 | The normalized program 21 | 22 | Notes 23 | ----- 24 | This function modify the program object of the argument 25 | to reduce runtime overhead. 
26 | """ 27 | program = program.clone() # Clone program to isolate the argument from modifications 28 | 29 | # inputs should be sorted by id 30 | program.inputs.sort(key=lambda i: i.id) 31 | 32 | # Re-assign id 33 | old_id_to_new_id = dict() 34 | for i in program.inputs: 35 | old_id = i.id 36 | i.id = len(old_id_to_new_id) 37 | old_id_to_new_id[old_id] = i.id 38 | for statement in program.body: 39 | old_id = statement.variable.id 40 | statement.variable.id = len(old_id_to_new_id) 41 | old_id_to_new_id[old_id] = statement.variable.id 42 | for arg in statement.expression.arguments: 43 | arg.id = old_id_to_new_id[arg.id] 44 | 45 | return program 46 | 47 | 48 | def remove_redundant_variables(program: Program) -> Program: 49 | """ 50 | Return the program that is removed the redundant variables 51 | For examples, the program A will be converted to the program B 52 | ``` 53 | 54 | a <- [int] 55 | b <- int 56 | c <- TAKE b a 57 | d <- REVERSE a 58 | 59 | 60 | a <- [int] 61 | d <- REVERSE a 62 | ``` 63 | 64 | Parameters 65 | ---------- 66 | program : Program 67 | The program that will be simplified 68 | 69 | Returns 70 | ------- 71 | Program 72 | The simplified program 73 | """ 74 | 75 | program = program.clone() # Clone program to isolate the argument from modifications 76 | 77 | inputs = [] 78 | body = [] 79 | if len(program.body) == 0: 80 | program.inputs.clear() 81 | program.body.clear() 82 | return program 83 | 84 | # Last line is always used (because it is output value) 85 | v_used = set([program.body[-1].variable]) 86 | for statement in program.body[::-1]: 87 | if statement.variable in v_used: 88 | # v is not a redundant variable 89 | body.append(statement) 90 | for a in statement.expression.arguments: 91 | v_used.add(a) 92 | body.reverse() 93 | 94 | for v in program.inputs: 95 | if v in v_used: 96 | # v is not a redundant variable 97 | inputs.append(v) 98 | 99 | program.inputs = inputs 100 | program.body = body 101 | return program 102 | 103 | 104 | def remove_redundant_expressions(program: Program) -> Program: 105 | """ 106 | Return the program that is removed the redundant expressions 107 | This function applies following 3 rules: 108 | 109 | Rule1: The duplicated expressions will be merged 110 | ``` 111 | 112 | a <- [int] 113 | b <- REVERSE a 114 | c <- REVERSE a 115 | d <- ZIPWITH * b c 116 | 117 | 118 | a <- [int] 119 | b <- REVERSE a 120 | d <- ZIPWITH * b b 121 | ``` 122 | 123 | Rule2: SORT function for the sorted list will be removed 124 | ``` 125 | 126 | a <- [int] 127 | b <- SORT a 128 | c <- SORT a 129 | d <- ZIPWITH * b c 130 | 131 | 132 | a <- [int] 133 | b <- SORT a 134 | d <- ZIPWITH * b b 135 | ``` 136 | 137 | Rule3: REVERSE function for the reversed list will be removed 138 | ``` 139 | 140 | a <- [int] 141 | b <- REVERSE a 142 | c <- REVERSE a 143 | d <- ZIPWITH * b c 144 | 145 | 146 | a <- [int] 147 | b <- REVERSE a 148 | d <- ZIPWITH * b a 149 | ``` 150 | 151 | Parameters 152 | ---------- 153 | program : Program 154 | The program that will be simplified 155 | 156 | Returns 157 | ------- 158 | Program 159 | The simplified program 160 | """ 161 | program = program.clone() # Clone program to isolate the argument from modifications 162 | 163 | replacement = dict() # Variable -> Variable 164 | expression_to_variable = dict() # (str, [Variable]) -> Variable 165 | variable_to_expression = dict() # Variable -> Expression 166 | 167 | body = [] 168 | for statement in program.body: 169 | if (statement.expression.function.name, tuple(statement.expression.arguments)) in 
expression_to_variable: 170 | # Rule1 171 | replacement[statement.variable] = expression_to_variable[( 172 | statement.expression.function.name, tuple(statement.expression.arguments))] 173 | continue 174 | if len(statement.expression.arguments) > 0 and statement.expression.arguments[0] in variable_to_expression: 175 | exp_arg1 = variable_to_expression[statement.expression.arguments[0]] 176 | if statement.expression.function.name == "SORT" and (exp_arg1.function.name == "SORT"): 177 | # Rule2 178 | replacement[statement.variable] = statement.expression.arguments[0] 179 | continue 180 | if statement.expression.function.name == "REVERSE" and (exp_arg1.function.name == "REVERSE"): 181 | # Rule3 182 | replacement[statement.variable] = variable_to_expression[statement.expression.arguments[0]].arguments[0] 183 | continue 184 | 185 | for i, arg in enumerate(statement.expression.arguments): 186 | if arg in replacement: 187 | statement.expression.arguments[i] = replacement[arg] 188 | body.append(statement) 189 | 190 | expression_to_variable[(statement.expression.function.name, tuple( 191 | statement.expression.arguments))] = statement.variable 192 | variable_to_expression[statement.variable] = statement.expression 193 | program.body = body 194 | 195 | return program 196 | 197 | 198 | def remove_dependency_between_variables(program: Program, minimum: Function, maximum: Function) -> Program: 199 | """ 200 | Return the program that is reduced dependencies between variables 201 | This function applies following 3 rules: 202 | 203 | Rule1: Reordering functions (REVERSE, SORT) before reduce functions 204 | (SUM, MAXIMUM, MINIMUM) will be ignored 205 | ``` 206 | 207 | a <- [int] 208 | b <- REVERSE a 209 | c <- SUM b 210 | 211 | 212 | a <- [int] 213 | b <- REVERSE a 214 | c <- SUM a 215 | ``` 216 | 217 | Rule2: HEAD function to a sorted list will be converted 218 | into MINIMUM function 219 | ``` 220 | 221 | a <- [int] 222 | b <- SORT a 223 | c <- HEAD b 224 | 225 | 226 | a <- [int] 227 | b <- SORT a 228 | c <- MINIMUM a 229 | ``` 230 | 231 | Rule3: LAST function to a sorted list will be converted 232 | into MAXIMUM function 233 | ``` 234 | 235 | a <- [int] 236 | b <- SORT a 237 | c <- LAST b 238 | 239 | 240 | a <- [int] 241 | b <- SORT a 242 | c <- MAXIMUM a 243 | ``` 244 | 245 | Parameters 246 | ---------- 247 | program : Program 248 | The program that will be simplified 249 | minimum : Function 250 | The MINIMUM function 251 | maximum : Function 252 | The MAXIMUM function 253 | 254 | Returns 255 | ------- 256 | Program 257 | The simplified program 258 | """ 259 | program = program.clone() # Clone program to isolate the argument from modifications 260 | 261 | variable_to_expression = dict() # Variable -> Expression 262 | 263 | body = [] 264 | for statement in program.body: 265 | if len(statement.expression.arguments) > 0 and statement.expression.arguments[0] in variable_to_expression: 266 | exp_arg1 = variable_to_expression[statement.expression.arguments[0]] 267 | if (exp_arg1.function.name == "SORT" or exp_arg1.function.name == "REVERSE") and (statement.expression.function.name == "SUM" or statement.expression.function.name == "MAXIMUM" or statement.expression.function.name == "MINIMUM"): 268 | # Rule1 269 | x = exp_arg1.arguments[0] 270 | exp = Expression(statement.expression.function, [x]) 271 | body.append(Statement(statement.variable, exp)) 272 | variable_to_expression[statement.variable] = statement.expression 273 | continue 274 | if exp_arg1.function.name == "SORT" and 
(statement.expression.function.name == "HEAD"): 275 | # Rule2 276 | x = exp_arg1.arguments[0] 277 | exp = Expression(minimum, [x]) 278 | body.append(Statement(statement.variable, exp)) 279 | variable_to_expression[statement.variable] = exp 280 | continue 281 | if exp_arg1.function.name == "SORT" and (statement.expression.function.name == "LAST"): 282 | # Rule3 283 | x = exp_arg1.arguments[0] 284 | exp = Expression(maximum, [x]) 285 | body.append(Statement(statement.variable, exp)) 286 | variable_to_expression[statement.variable] = exp 287 | continue 288 | 289 | body.append(statement) 290 | variable_to_expression[statement.variable] = statement.expression 291 | 292 | program.body = body 293 | return program 294 | -------------------------------------------------------------------------------- /generate_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder generate-dataset", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " drive.mount('/gdrive')\n", 41 | " runtime = \"host\"\n", 42 | "except:\n", 43 | " runtime = \"local\"" 44 | ], 45 | "execution_count": 0, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "_S457sT6QMUr", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "2LYvG4iCQUwh", 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "#@title Parameters\n", 67 | "#@markdown |Name |Description|\n", 68 | "#@markdown |:--- |:---|\n", 69 | "#@markdown |`seed`|The random seed|\n", 70 | "seed = 3984 #@param {type: \"number\"}\n", 71 | "\n", 72 | "#@markdown ### `deep-coder` Repositories\n", 73 | "#@markdown |Name |Description|\n", 74 | "#@markdown |:--- |:---|\n", 75 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 76 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 77 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 78 | "branch_name = \"master\" #@param {type: \"string\"}\n", 79 | "\n", 80 | "#@markdown ### Dataset Configurations\n", 81 | "#@markdown |Name |Description|\n", 82 | "#@markdown |:--- |:---|\n", 83 | "#@markdown |`num_dataset` |The total number of programs in the dataset. 
If it is -1, the program will enumerate all valid source code.|\n", 84 | "#@markdown |`num_valid` |The number of programs used for validation.|\n", 85 | "#@markdown |`value_range` |The largest absolute value used in the dataset.|\n", 86 | "#@markdown |`max_list_length` |The maximum length of lists used in the dataset.|\n", 87 | "#@markdown |`num_examples`|The number of I/O examples per program|\n", 88 | "#@markdown |`min_length` |The minimum length of the program body|\n", 89 | "#@markdown |`max_length` |The maximum length of the program body|\n", 90 | "#@markdown |`num_examples_for_pruning`|The number of examples used to prune the identical programs.|\n", 91 | "num_dataset = -1 #@param {type: \"number\"}\n", 92 | "num_valid = 10 #@param {type: \"number\"}\n", 93 | "value_range = 256 #@param {type: \"number\"}\n", 94 | "max_list_length = 20 #@param {type: \"number\"}\n", 95 | "num_examples = 5 #@param {type: \"number\"}\n", 96 | "min_length = 1 #@param {type: \"number\"}\n", 97 | "max_length = 2 #@param {type: \"number\"}\n", 98 | "num_examples_for_pruning = 100 #@param {type: \"number\"}\n", 99 | "\n", 100 | "#@markdown ### Filepath\n", 101 | "#@markdown |Name |Description|\n", 102 | "#@markdown |:--- |:---|\n", 103 | "#@markdown |`destination_dir_path` |The directory of the directory that will contain the dataset.|\n", 104 | "destination_dir_path = \"dataset/\" #@param {type: \"string\"}\n" 105 | ], 106 | "execution_count": 0, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "_BembldCdOO1", 113 | "colab_type": "text" 114 | }, 115 | "source": [ 116 | "### Setup\n", 117 | "* Fix the random seed\n", 118 | "* Download the codebase (when using the host runtime)\n", 119 | " 1. Clone git repository and move to the specified branch\n", 120 | " 2. Initialize submodule\n", 121 | " 3. Build the `search` tool\n", 122 | " 4. Install chainer and cupy\n", 123 | "* Remove the temporary file" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "GwjlAkY1fR5j", 130 | "colab_type": "code", 131 | "colab": {} 132 | }, 133 | "source": [ 134 | "import numpy as np\n", 135 | "import random\n", 136 | "\n", 137 | "SEED_MAX = 2**32 - 1\n", 138 | "\n", 139 | "root_rng = np.random.RandomState(seed)\n", 140 | "random.seed(root_rng.randint(SEED_MAX))\n", 141 | "np.random.seed(root_rng.randint(SEED_MAX))" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "FIZJmuz8QFn_", 150 | "colab_type": "code", 151 | "colab": {} 152 | }, 153 | "source": [ 154 | "if runtime == \"host\":\n", 155 | " %cd /content\n", 156 | " !rm -rf deep-coder\n", 157 | " ![ ! 
-e deep-coder ] && git clone $repository_url deep-coder\n", 158 | " %cd deep-coder\n", 159 | " !git checkout origin/$branch_name\n", 160 | " !git submodule init\n", 161 | " !git submodule update\n", 162 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 163 | " !curl https://colab.chainer.org/install | sh -\n", 164 | " !pip install tqdm" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "colab_type": "code", 173 | "id": "alNTHkTVFokT", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "!rm -rf ./dataset.pickle" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "id": "Oz7sdzxUi70b", 186 | "colab_type": "text" 187 | }, 188 | "source": [ 189 | "### Generate Dataset\n", 190 | "* Generate the total dataset\n", 191 | "* Divide the dataset into `train` and `valid`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "h7kdglcUjDTQ", 198 | "colab_type": "code", 199 | "colab": {} 200 | }, 201 | "source": [ 202 | "import numpy as np\n", 203 | "import os\n", 204 | "from tqdm import tqdm_notebook as tqdm\n", 205 | "\n", 206 | "from src.dsl import to_function, Program\n", 207 | "from src.deepcoder_utils import generate_io_samples\n", 208 | "from src.generate_dataset import generate_dataset, DatasetSpec, EquivalenceCheckingSpec, IteratorDecorator\n", 209 | "from src.program_simplifier import remove_redundant_variables, remove_redundant_expressions, remove_dependency_between_variables\n", 210 | "\n", 211 | "LINQ, _ = generate_io_samples.get_language(value_range)\n", 212 | "LINQ = [f for f in LINQ if not \"IDT\" in f.src]\n", 213 | "\n", 214 | "MINIMUM = to_function([f for f in LINQ if f.src == \"MINIMUM\"][0])\n", 215 | "MAXIMUM = to_function([f for f in LINQ if f.src == \"MAXIMUM\"][0])\n", 216 | "\n", 217 | "\n", 218 | "def simplify(program):\n", 219 | " program = remove_redundant_expressions(program)\n", 220 | " program = remove_redundant_variables(program)\n", 221 | " program = remove_dependency_between_variables(program, MINIMUM, MAXIMUM)\n", 222 | " return program\n", 223 | "\n", 224 | "\n", 225 | "# TODO: tqdm_notebook does not work in a local runtime\n", 226 | "program_iterator = lambda iterator: tqdm(iterator, desc=\"Program Generation\")\n", 227 | "entry_iterator = lambda iterator: tqdm(iterator, desc=\"Prune Entries\")\n", 228 | "decorator = IteratorDecorator(program_iterator, entry_iterator)\n", 229 | "\n", 230 | "generate_dataset(LINQ,\n", 231 | " DatasetSpec(value_range, max_list_length,\n", 232 | " num_examples, min_length, max_length),\n", 233 | " EquivalenceCheckingSpec(0, num_examples_for_pruning, np.random.RandomState(\n", 234 | " root_rng.randint(SEED_MAX))),\n", 235 | " \"./dataset.pickle\", num_dataset if num_dataset > 0 else None,\n", 236 | " simplify=simplify, decorator=decorator)\n" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "GUwjz07KZPld", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "import pickle\n", 250 | "import chainer as ch\n", 251 | "from src.dataset import Dataset\n", 252 | "\n", 253 | "if not os.path.exists(destination_dir_path):\n", 254 | " os.makedirs(destination_dir_path)\n", 255 | "\n", 256 | "with open(\"./dataset.pickle\", \"rb\") as f:\n", 257 | " dataset: Dataset = pickle.load(f)\n", 258 | "\n", 259 | "num_valid = num_valid\n", 260 | 
"num_train = len(dataset.dataset) - num_valid\n", 261 | "\n", 262 | "train, valid = ch.datasets.split_dataset_random(\n", 263 | " dataset.dataset, num_train, seed=root_rng.randint(SEED_MAX))\n", 264 | "\n", 265 | "with open(os.path.join(destination_dir_path, \"train.pickle\"), \"wb\") as f:\n", 266 | " pickle.dump(Dataset(ch.datasets.TupleDataset(\n", 267 | " list([d[0] for d in train])), dataset.metadata), f)\n", 268 | "with open(os.path.join(destination_dir_path, \"valid.pickle\"), \"wb\") as f:\n", 269 | " pickle.dump(Dataset(ch.datasets.TupleDataset(\n", 270 | " list([d[0] for d in valid])), dataset.metadata), f)\n" 271 | ], 272 | "execution_count": 0, 273 | "outputs": [] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "id": "YWufhkoaw9Bq", 279 | "colab_type": "text" 280 | }, 281 | "source": [ 282 | "### Teardown\n", 283 | "* Remove the temporary file" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "metadata": { 289 | "id": "D-WYlqxVkO5i", 290 | "colab_type": "code", 291 | "colab": {} 292 | }, 293 | "source": [ 294 | "!rm dataset.pickle" 295 | ], 296 | "execution_count": 0, 297 | "outputs": [] 298 | } 299 | ] 300 | } -------------------------------------------------------------------------------- /src/generate_dataset.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import copy 3 | import pickle 4 | import os 5 | import contextlib 6 | import numpy as np 7 | import chainer as ch 8 | from typing import List, Tuple, Union, Dict, Callable, Iterator 9 | from .dataset import Primitive, Example, Entry, Dataset, dataset_metadata 10 | from .deepcoder_utils import generate_io_samples 11 | from .dsl import Function, Program, Type, to_function, Signature 12 | from .program_simplifier import normalize 13 | from .program_generator import programs, random_programs 14 | 15 | 16 | @dataclasses.dataclass 17 | class DatasetSpec: 18 | """ 19 | The specification of the dataset 20 | 21 | Attribute 22 | --------- 23 | value_range : int 24 | max_list_length : int 25 | num_examples : int 26 | min_program_length : int 27 | max_program_length : int 28 | """ 29 | value_range: int 30 | max_list_length: int 31 | num_examples: int 32 | min_program_length: int 33 | max_program_length: int 34 | 35 | 36 | @dataclasses.dataclass 37 | class EquivalenceCheckingSpec: 38 | """ 39 | The specification used to check equivalence of programs 40 | 41 | Attribute 42 | --- 43 | ratio_of_examples : float 44 | num_of_examples : int 45 | rng : np.random.RandomState or None 46 | """ 47 | ratio_of_examples: float 48 | num_of_examples: int 49 | rng: Union[np.random.RandomState, None] 50 | 51 | 52 | SimplifyFunction = Callable[[Program], Program] 53 | 54 | 55 | @dataclasses.dataclass 56 | class IteratorDecorator: 57 | """ 58 | The functions to wrap the iterators (such as tqdm) 59 | 60 | Attributes: 61 | program_decorator : Callable[[Iterator], Iterator] 62 | The function to generate a decorator of an iterator of programs 63 | entry_decorator : Callable[[Iterator], Iterator] 64 | The function to generate a decorator of an iterator of entries 65 | """ 66 | 67 | program_decorator: Callable[[Iterator], Iterator] 68 | entry_decorator: Callable[[Iterator], Iterator] 69 | 70 | def generate_dataset(functions: List[generate_io_samples.Function], spec: DatasetSpec, 71 | equivalence_spec: EquivalenceCheckingSpec, 72 | destination: str, 73 | num_dataset: Union[None, int] = None, 74 | simplify: Union[None, SimplifyFunction] = None, 75 | decorator: 
70 | def generate_dataset(functions: List[generate_io_samples.Function], spec: DatasetSpec, 71 | equivalence_spec: EquivalenceCheckingSpec, 72 | destination: str, 73 | num_dataset: Union[None, int] = None, 74 | simplify: Union[None, SimplifyFunction] = None, 75 | decorator: Union[None, IteratorDecorator] = None): 76 | """ 77 | Generate a dataset and write it to the given file 78 | 79 | Parameters 80 | ---------- 81 | functions : list of generate_io_samples.Function 82 | The set of functions that can be used in the dataset 83 | spec : DatasetSpec 84 | The specification of the generated dataset 85 | equivalence_spec: EquivalenceCheckingSpec 86 | The specification used to check equivalence of programs 87 | destination : str 88 | The destination of the dataset file 89 | num_dataset : int or None 90 | The number of entries to be created. 91 | If this argument is None, the function enumerates all valid source code 92 | simplify : function or None 93 | The function to simplify the source code 94 | decorator: IteratorDecorator or None 95 | The decorator of iterators. It is mainly used to show progress (e.g., with tqdm) 96 | 97 | Notes 98 | ----- 99 | Currently this function generates and prunes source code in memory. 100 | This might become a problem if the number of programs is large. 101 | """ 102 | 103 | @dataclasses.dataclass 104 | class IntermediateEntry: 105 | source_code: str 106 | program: generate_io_samples.Program 107 | examples: List[Example] 108 | attribute: Dict[str, bool] 109 | 110 | def simplify_and_normalize(program: Program) -> Program: 111 | while True: 112 | p_old = program.to_string() 113 | if simplify is not None: 114 | program = simplify(program) 115 | 116 | if program.to_string() == p_old: 117 | break 118 | program = normalize(program) 119 | return program 120 | 121 | def get_signature(program: Program): 122 | input_types = [] 123 | for i in program.inputs: 124 | input_types.append(i.t) 125 | output = program.body[-1].expression.function.signature.output_type if len( 126 | program.body) > 0 else None 127 | return Signature(input_types, output) 128 | 129 | functions_dsl = [to_function(f) for f in functions] 130 | invalid_program = set() 131 | entries = dict() # Signature -> dict(str -> IntermediateEntry) 132 | 133 | def generate_intermediate_entry(program: Program) -> Union[None, IntermediateEntry]: 134 | # the last newline must be removed before compiling the source code 135 | code = program.to_string()[:-1] 136 | 137 | # Compile the source code 138 | with contextlib.redirect_stdout(None): # ignore stdout 139 | p = generate_io_samples.compile( 140 | code, V=spec.value_range, L=spec.max_list_length) 141 | if p is None: 142 | # Compilation failed 143 | return None 144 | 145 | try: 146 | # Generate IO examples 147 | with contextlib.redirect_stdout(None): # ignore stdout 148 | examples = generate_io_samples.generate_IO_examples( 149 | p, N=spec.num_examples, L=spec.max_list_length, V=spec.value_range) 150 | except ValueError: 151 | return None 152 | 153 | # Generate the binary attribute 154 | ss = set() 155 | for statement in program.body: 156 | for symbol in statement.expression.function.name.split(" "): 157 | ss.add(symbol) 158 | attribute = dict() 159 | for f in functions_dsl: 160 | for symbol in f.name.split(" "): 161 | if symbol not in attribute: 162 | attribute[symbol] = False 163 | attribute[symbol] |= symbol in ss 164 | 165 | return IntermediateEntry(code, p, list(map(lambda x: Example(x[0], x[1]), examples)), attribute) 166 | 
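# Illustration: `attribute` maps every DSL symbol to whether it occurs in the
# program. For a hypothetical one-statement body "b <- MAXIMUM a", the dict
# would contain {"MAXIMUM": True} and False for every other symbol; this is
# the binary attribute vector that the predictor in src/model.py learns.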
167 | if num_dataset is None: 168 | # Enumerate source code 169 | d = decorator.program_decorator if decorator is not None else lambda x: x 170 | for program in d(programs(functions_dsl, spec.min_program_length, spec.max_program_length)): 171 | program = simplify_and_normalize(program) # Simplify the program 172 | if not (spec.min_program_length <= len(program.body) <= spec.max_program_length): 173 | # If the length of the simplified program is out of range, discard it 174 | continue 175 | 176 | signature = get_signature(program) 177 | if signature not in entries: 178 | entries[signature] = dict() 179 | 180 | if program.to_string() in invalid_program: 181 | # Generating an entry for this program failed in the past 182 | continue 183 | 184 | entry = generate_intermediate_entry(program) 185 | if entry is None: 186 | invalid_program.add(program.to_string()) 187 | continue 188 | if entry.source_code in entries[signature]: 189 | # the program has already been added to the dataset 190 | continue 191 | 192 | entries[signature][entry.source_code] = entry 193 | 194 | dataset = [] 195 | # Prune entries 196 | d = decorator.entry_decorator if decorator is not None else lambda x: x 197 | rng = equivalence_spec.rng if equivalence_spec.rng is not None else np.random 198 | for signature, ientries in d(entries.items()): 199 | examples: List[List[Primitive]] = list() 200 | # Extract examples for checking equivalence 201 | num = max( 202 | 1, 203 | equivalence_spec.num_of_examples, 204 | int(len(ientries) * spec.num_examples * equivalence_spec.ratio_of_examples)) 205 | num = min(num, len(ientries) * spec.num_examples) 206 | 207 | from_all_entries = num // len(ientries) 208 | from_partial_entries = num % len(ientries) 209 | 
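# Worked example: for a signature with 37 programs, num_examples = 5 and
# num_of_examples = 100 (ratio_of_examples = 0), num = min(max(1, 100, 0),
# 37 * 5) = 100, so from_all_entries = 100 // 37 = 2 examples are taken from
# every entry, and from_partial_entries = 100 % 37 = 26 entries contribute
# one extra example each.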
210 | # Extract examples from all entries 211 | not_used = dict() # str -> [int] 212 | for entry in ientries.values(): 213 | indexes = set(rng.choice(list(range(spec.num_examples)), 214 | from_all_entries, replace=False)) 215 | for index in indexes: 216 | examples.append(entry.examples[index].inputs) 217 | not_used[entry.source_code] = [i for i in range( 218 | spec.num_examples) if not (i in indexes)] 219 | # Extract examples from partial entries 220 | if from_partial_entries != 0: 221 | for entry in rng.choice(list(ientries.values()), from_partial_entries, replace=False): 222 | index = rng.choice(not_used[entry.source_code]) 223 | examples.append(entry.examples[index].inputs) 224 | 225 | # Execute programs 226 | es = dict() # Tuple[Primitive] -> IntermediateEntry 227 | for entry in ientries.values(): 228 | result = [] 229 | for example in examples: 230 | output = entry.program.fun(example) 231 | if entry.program.out == int: 232 | result.append(output) 233 | else: 234 | result.append(tuple(output)) 235 | result = tuple(result) 236 | if result not in es: 237 | es[result] = entry 238 | else: 239 | # If there is an equivalent program, keep the shorter one 240 | l1 = len(es[result].source_code.split("\n")) 241 | l2 = len(entry.source_code.split("\n")) 242 | if l1 > l2: 243 | es[result] = entry 244 | 245 | # Create dataset instance 246 | for entry in es.values(): 247 | dataset.append(Entry( 248 | entry.source_code, entry.examples, entry.attribute 249 | )) 250 | else: 251 | # Generate the fixed number of entries 252 | entries = dict() 253 | n_entries = 0 254 | d = decorator.program_decorator if decorator is not None else lambda x: x 255 | for program in d(random_programs(functions_dsl, spec.min_program_length, spec.max_program_length)): 256 | program = simplify_and_normalize(program) # Simplify the program 257 | if not (spec.min_program_length <= len(program.body) <= spec.max_program_length): 258 | # If the length of the simplified program is out of range, discard it 259 | continue 260 | 261 | signature = get_signature(program) 262 | if signature not in entries: 263 | entries[signature] = dict() 264 | 265 | if program.to_string() in invalid_program: 266 | # Generating an entry for this program failed in the past 267 | continue 268 | 269 | entry = generate_intermediate_entry(program) 270 | if entry is None: 271 | invalid_program.add(program.to_string()) 272 | continue 273 | if entry.source_code in entries[signature]: 274 | # the program has already been added to the dataset 275 | continue 276 | 277 | # Prune the program against already-stored equivalent ones 278 | def prune_program(): 279 | for e in entries[signature].values(): 280 | # `e` is treated as equivalent to `entry` if it reproduces all of `entry`'s examples 281 | if all(e.program.fun(example.inputs) == example.output for example in entry.examples): 282 | l1 = len(entry.source_code.split("\n")) 283 | l2 = len(e.source_code.split("\n")) 284 | if l1 < l2: 285 | # the new program is shorter; report which entry to replace 286 | return e.source_code 287 | else: 288 | return "Ignore" 289 | return "Add" 290 | 291 | pruned_result = prune_program() 292 | if pruned_result == "Add": 293 | n_entries += 1 294 | entries[signature][entry.source_code] = entry 295 | elif pruned_result != "Ignore": 296 | # replace the equivalent but longer program with the new one 297 | del entries[signature][pruned_result] 298 | entries[signature][entry.source_code] = entry 299 | 300 | if n_entries >= num_dataset: 301 | break 302 | 303 | dataset = [] 304 | # Create dataset instance 305 | for es in entries.values(): 306 | for entry in es.values(): 307 | dataset.append(Entry( 308 | entry.source_code, entry.examples, entry.attribute 309 | )) 310 | 311 | # Create metadata 312 | dataset = ch.datasets.TupleDataset(dataset) 313 | metadata = dataset_metadata( 314 | dataset, spec.value_range, spec.max_list_length) 315 | 316 | # Dump the dataset to the file 317 | with open(destination, "wb") as f: 318 | pickle.dump(Dataset(dataset, metadata), f) 319 | -------------------------------------------------------------------------------- /examples/medium/generate_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "generate_dataset (length=3)", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "yje9hqtcUQ_f", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "### Initialization\n", 25 | "* Check whether the runtime is host or local.\n", 26 | "* Mount Google Drive when using the host runtime." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "FwqGy_GyUQnw", 33 | "colab_type": "code", 34 | "colab": {} 35 | }, 36 | "source": [ 37 | "try:\n", 38 | " from google.colab import drive\n", 39 | " drive.mount('/gdrive')\n", 40 | " runtime = \"host\"\n", 41 | "except:\n", 42 | " runtime = \"local\"" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "_S457sT6QMUr", 51 | "colab_type": "text" 52 | }, 53 | "source": [ 54 | "### Parameters" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "2LYvG4iCQUwh", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "#@title Parameters\n", 66 | "#@markdown |Name |Description|\n", 67 | "#@markdown |:--- |:---|\n", 68 | "#@markdown |`seed`|The random seed|\n", 69 | "seed = 3984 #@param {type: \"number\"}\n", 70 | "\n", 71 | "#@markdown ### `deep-coder` Repositories\n", 72 | "#@markdown |Name |Description|\n", 73 | "#@markdown |:--- |:---|\n", 74 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 75 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 76 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 77 | "branch_name = \"master\" #@param {type: \"string\"}\n", 78 | "\n", 79 | "#@markdown ### Dataset Configurations\n", 80 | "#@markdown |Name |Description|\n", 81 | "#@markdown |:--- |:---|\n", 82 | "#@markdown |`num_dataset` |The total number of programs in the dataset. If it is -1, the program will enumerate all valid source code.|\n", 83 | "#@markdown |`num_valid` |The number of programs used for validation.|\n", 84 | "#@markdown |`value_range` |The largest absolute value used in the dataset.|\n", 85 | "#@markdown |`max_list_length` |The maximum length of lists used in the dataset.|\n", 86 | "#@markdown |`num_examples`|The number of I/O examples per program|\n", 87 | "#@markdown |`min_length` |The minimum length of the program body|\n", 88 | "#@markdown |`max_length` |The maximum length of the program body|\n", 89 | "#@markdown |`num_examples_for_pruning`|The number of examples used to prune the identical programs.|\n", 90 | "num_dataset = -1 #@param {type: \"number\"}\n", 91 | "num_valid = 500 #@param {type: \"number\"}\n", 92 | "value_range = 256 #@param {type: \"number\"}\n", 93 | "max_list_length = 20 #@param {type: \"number\"}\n", 94 | "num_examples = 5 #@param {type: \"number\"}\n", 95 | "min_length = 1 #@param {type: \"number\"}\n", 96 | "max_length = 3 #@param {type: \"number\"}\n", 97 | "num_examples_for_pruning = 100 #@param {type: \"number\"}\n", 98 | "\n", 99 | "#@markdown ### Filepath\n", 100 | "#@markdown |Name |Description|\n", 101 | "#@markdown |:--- |:---|\n", 102 | "#@markdown |`destination_dir_path` |The directory of the directory that will contain the dataset.|\n", 103 | "destination_dir_path = \"./dataset/length_3\" #@param {type: \"string\"}\n" 104 | ], 105 | "execution_count": 0, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "_BembldCdOO1", 112 | "colab_type": "text" 113 | }, 114 | "source": [ 115 | "### Setup\n", 116 | "* Fix the random seed\n", 117 | "* Download the codebase (when using the host runtime)\n", 118 | " 1. Clone git repository and move to the specified branch\n", 119 | " 2. Initialize submodule\n", 120 | " 3. Build the `search` tool\n", 121 | " 4. 
Install chainer and cupy\n", 122 | "* Remove the temporary file" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "GwjlAkY1fR5j", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "import numpy as np\n", 134 | "import random\n", 135 | "\n", 136 | "SEED_MAX = 2**32 - 1\n", 137 | "\n", 138 | "root_rng = np.random.RandomState(seed)\n", 139 | "random.seed(root_rng.randint(SEED_MAX))\n", 140 | "np.random.seed(root_rng.randint(SEED_MAX))" 141 | ], 142 | "execution_count": 0, 143 | "outputs": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "metadata": { 148 | "id": "FIZJmuz8QFn_", 149 | "colab_type": "code", 150 | "colab": {} 151 | }, 152 | "source": [ 153 | "if runtime == \"host\":\n", 154 | " %cd /content\n", 155 | " !rm -rf deep-coder\n", 156 | " ![ ! -e deep-coder ] && git clone $repository_url deep-coder\n", 157 | " %cd deep-coder\n", 158 | " !git checkout origin/$branch_name\n", 159 | " !git submodule init\n", 160 | " !git submodule update\n", 161 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 162 | " !curl https://colab.chainer.org/install | sh -\n", 163 | " !pip install tqdm" 164 | ], 165 | "execution_count": 0, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "metadata": { 171 | "colab_type": "code", 172 | "id": "alNTHkTVFokT", 173 | "colab": {} 174 | }, 175 | "source": [ 176 | "!rm -rf ./dataset.pickle" 177 | ], 178 | "execution_count": 0, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "Oz7sdzxUi70b", 185 | "colab_type": "text" 186 | }, 187 | "source": [ 188 | "### Generate Dataset\n", 189 | "* Generate the total dataset\n", 190 | "* Divide the dataset into `train` and `valid`." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "metadata": { 196 | "id": "h7kdglcUjDTQ", 197 | "colab_type": "code", 198 | "outputId": "5a3593f2-a0f9-4a6a-8725-494f7f583dfc", 199 | "colab": { 200 | "base_uri": "https://localhost:8080/", 201 | "height": 119 202 | } 203 | }, 204 | "source": [ 205 | "import numpy as np\n", 206 | "import os\n", 207 | "from tqdm import tqdm_notebook as tqdm\n", 208 | "\n", 209 | "from src.dsl import to_function, Program\n", 210 | "from src.deepcoder_utils import generate_io_samples\n", 211 | "from src.generate_dataset import generate_dataset, DatasetSpec, EquivalenceCheckingSpec, IteratorDecorator\n", 212 | "from src.program_simplifier import remove_redundant_variables, remove_redundant_expressions, remove_dependency_between_variables\n", 213 | "\n", 214 | "LINQ, _ = generate_io_samples.get_language(value_range)\n", 215 | "LINQ = [f for f in LINQ if not \"IDT\" in f.src]\n", 216 | "\n", 217 | "MINIMUM = to_function([f for f in LINQ if f.src == \"MINIMUM\"][0])\n", 218 | "MAXIMUM = to_function([f for f in LINQ if f.src == \"MAXIMUM\"][0])\n", 219 | "\n", 220 | "\n", 221 | "def simplify(program):\n", 222 | " program = remove_redundant_expressions(program)\n", 223 | " program = remove_redundant_variables(program)\n", 224 | " program = remove_dependency_between_variables(program, MINIMUM, MAXIMUM)\n", 225 | " return program\n", 226 | "\n", 227 | "\n", 228 | "# TODO: tqdm_notebook does not work in a local runtime\n", 229 | "program_iterator = lambda iterator: tqdm(iterator, desc=\"Program Generation\")\n", 230 | "entry_iterator = lambda iterator: tqdm(iterator, desc=\"Prune Entries\")\n", 231 | "decorator = IteratorDecorator(program_iterator, entry_iterator)\n", 232 | "\n", 233 | "generate_dataset(LINQ,\n", 234 | " 
DatasetSpec(value_range, max_list_length,\n", 235 | " num_examples, min_length, max_length),\n", 236 | " EquivalenceCheckingSpec(0, num_examples_for_pruning, np.random.RandomState(\n", 237 | " root_rng.randint(SEED_MAX))),\n", 238 | " \"./dataset.pickle\", num_dataset if num_dataset > 0 else None,\n", 239 | " simplify=simplify, decorator=decorator)\n" 240 | ], 241 | "execution_count": 0, 242 | "outputs": [ 243 | { 244 | "output_type": "display_data", 245 | "data": { 246 | "text/plain": [ 247 | "HBox(children=(IntProgress(value=1, bar_style='info', description='Program Generation', max=1, style=ProgressS…" 248 | ], 249 | "application/vnd.jupyter.widget-view+json": { 250 | "version_major": 2, 251 | "version_minor": 0, 252 | "model_id": "a40902c9288b4d4f9fdb81add81da30a" 253 | } 254 | }, 255 | "metadata": { 256 | "tags": [] 257 | } 258 | }, 259 | { 260 | "output_type": "stream", 261 | "text": [ 262 | "\n" 263 | ], 264 | "name": "stdout" 265 | }, 266 | { 267 | "output_type": "display_data", 268 | "data": { 269 | "text/plain": [ 270 | "HBox(children=(IntProgress(value=0, description='Prune Entries', max=37, style=ProgressStyle(description_width…" 271 | ], 272 | "application/vnd.jupyter.widget-view+json": { 273 | "version_major": 2, 274 | "version_minor": 0, 275 | "model_id": "76cf0097b3ad49a3a03363e7dcd83559" 276 | } 277 | }, 278 | "metadata": { 279 | "tags": [] 280 | } 281 | }, 282 | { 283 | "output_type": "stream", 284 | "text": [ 285 | "/home/mhiroaki/HiroakiMikami/ml-program/DeepCoder_Utils/generate_io_samples.py:123: RuntimeWarning: overflow encountered in long_scalars\n", 286 | " j, lambda AB: MUL_bounds(AB[0], AB[1])),\n", 287 | "/home/mhiroaki/HiroakiMikami/ml-program/DeepCoder_Utils/generate_io_samples.py:105: RuntimeWarning: overflow encountered in long_scalars\n", 288 | " Function('SQR', (int, int), lambda i: i*i,\n" 289 | ], 290 | "name": "stderr" 291 | }, 292 | { 293 | "output_type": "stream", 294 | "text": [ 295 | "\n" 296 | ], 297 | "name": "stdout" 298 | } 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "metadata": { 304 | "id": "GUwjz07KZPld", 305 | "colab_type": "code", 306 | "colab": {} 307 | }, 308 | "source": [ 309 | "import pickle\n", 310 | "import chainer as ch\n", 311 | "from src.dataset import Dataset\n", 312 | "\n", 313 | "if not os.path.exists(destination_dir_path):\n", 314 | " os.makedirs(destination_dir_path)\n", 315 | "\n", 316 | "with open(\"./dataset.pickle\", \"rb\") as f:\n", 317 | " dataset: Dataset = pickle.load(f)\n", 318 | "\n", 319 | "num_valid = num_valid\n", 320 | "num_train = len(dataset.dataset) - num_valid\n", 321 | "\n", 322 | "train, valid = ch.datasets.split_dataset_random(\n", 323 | " dataset.dataset, num_train, seed=root_rng.randint(SEED_MAX))\n", 324 | "\n", 325 | "with open(os.path.join(destination_dir_path, \"train.pickle\"), \"wb\") as f:\n", 326 | " pickle.dump(Dataset(ch.datasets.TupleDataset(\n", 327 | " list([d[0] for d in train])), dataset.metadata), f)\n", 328 | "with open(os.path.join(destination_dir_path, \"valid.pickle\"), \"wb\") as f:\n", 329 | " pickle.dump(Dataset(ch.datasets.TupleDataset(\n", 330 | " list([d[0] for d in valid])), dataset.metadata), f)\n" 331 | ], 332 | "execution_count": 0, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "YWufhkoaw9Bq", 339 | "colab_type": "text" 340 | }, 341 | "source": [ 342 | "### Teardown\n", 343 | "* Remove the temporary file" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "D-WYlqxVkO5i", 350 
| "colab_type": "code", 351 | "colab": {} 352 | }, 353 | "source": [ 354 | "!rm dataset.pickle" 355 | ], 356 | "execution_count": 0, 357 | "outputs": [] 358 | } 359 | ] 360 | } -------------------------------------------------------------------------------- /train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder train", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "yje9hqtcUQ_f", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "### Initialization\n", 27 | "* Check whether the runtime is host or local.\n", 28 | "* Mount Google Drive when using the host runtime." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "metadata": { 34 | "id": "FwqGy_GyUQnw", 35 | "colab_type": "code", 36 | "colab": {} 37 | }, 38 | "source": [ 39 | "try:\n", 40 | " from google.colab import drive\n", 41 | " drive.mount('/gdrive')\n", 42 | " runtime = \"host\"\n", 43 | "except:\n", 44 | " runtime = \"local\"" 45 | ], 46 | "execution_count": 0, 47 | "outputs": [] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "id": "_S457sT6QMUr", 53 | "colab_type": "text" 54 | }, 55 | "source": [ 56 | "### Parameters" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "colab_type": "code", 63 | "id": "QN-4eF51DNqt", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "#@title Parameters\n", 68 | "#@markdown |Name |Description|\n", 69 | "#@markdown |:--- |:---|\n", 70 | "#@markdown |`seed`|The random seed|\n", 71 | "seed = 3984 #@param {type: \"number\"}\n", 72 | "\n", 73 | "#@markdown ### `deep-coder` Repositories\n", 74 | "#@markdown |Name |Description|\n", 75 | "#@markdown |:--- |:---|\n", 76 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 77 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 78 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 79 | "branch_name = \"master\" #@param {type: \"string\"}\n", 80 | "\n", 81 | "#@markdown ### Model Parameters\n", 82 | "#@markdown |Name |Description|\n", 83 | "#@markdown |:--- |:---|\n", 84 | "#@markdown |`n_embed` |The dimension of integer embeddings|\n", 85 | "#@markdown |`n_units` |The number of units in the hidden layers|\n", 86 | "#@markdown |`num_hidden_layers`|The number of the hidden layers|\n", 87 | "n_embed = 20 #@param {type: \"number\"}\n", 88 | "n_units = 256 #@param {type: \"number\"}\n", 89 | "num_hidden_layers = 3 #@param {type: \"number\"}\n", 90 | "\n", 91 | "#@markdown ### Training Settings\n", 92 | "#@markdown |Name |Description|\n", 93 | "#@markdown |:--- |:---|\n", 94 | "#@markdown |`batch_size` |The minibatch size|\n", 95 | "#@markdown |`weight_label_false`|The weight for the loss value in the case of attribute=False. 
`-1` means using the original loss function|\n", 96 | "#@markdown |`num_epochs`        |The number of epochs|\n", 97 | "#@markdown |`ratio_test`        |The ratio of entries for testing|\n", 98 | "#@markdown |`num_train`         |The number of entries used for training|\n", 99 | "batch_size = 32 #@param {type: \"number\"}\n", 100 | "weight_label_false = -1 #@param {type: \"number\"}\n", 101 | "num_epochs = 10 #@param {type: \"number\"}\n", 102 | "ratio_test = 0 #@param {type: \"number\"}\n", 103 | "num_train = 0 #@param {type: \"number\"}\n", 104 | "\n", 105 | "#@markdown ### Validation Settings\n", 106 | "#@markdown |Name |Description|\n", 107 | "#@markdown |:--- |:---|\n", 108 | "#@markdown |`timeout_second`    |The timeout of the program search, in seconds|\n", 109 | "#@markdown |`max_program_length`|The maximum length of the program|\n", 110 | "timeout_second = 1 #@param {type: \"number\"}\n", 111 | "max_program_length = 2 #@param {type: \"number\"}\n", 112 | "\n", 113 | "#@markdown ### Other Settings\n", 114 | "#@markdown |Name |Description|\n", 115 | "#@markdown |:--- |:---|\n", 116 | "#@markdown |`device`|The id of the GPU. `-1` means that the CPU is used.|\n", 117 | "device = 0 #@param {type: \"number\"}\n", 118 | "\n", 119 | "#@markdown ### Filepath\n", 120 | "#@markdown |Name |Description|\n", 121 | "#@markdown |:--- |:---|\n", 122 | "#@markdown |`train_dataset_path`|The file path of the training dataset.|\n", 123 | "#@markdown |`valid_dataset_path`|The file path of the validation dataset.|\n", 124 | "#@markdown |`destination_path`  |The path of the directory that will contain the training results.|\n", 125 | "train_dataset_path = \"./dataset/train.pickle\" #@param {type: \"string\"}\n", 126 | "valid_dataset_path = \"./dataset/valid.pickle\" #@param {type: \"string\"}\n", 127 | "destination_path = \"./out\" #@param {type: \"string\"}\n", 128 | "\n" 129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "_BembldCdOO1", 137 | "colab_type": "text" 138 | }, 139 | "source": [ 140 | "### Setup\n", 141 | "* Fix the random seed\n", 142 | "* Download the codebase\n", 143 | "  1. Clone git repository and move to the specified branch\n", 144 | "  2. Initialize submodule\n", 145 | "  3. Install chainer and cupy\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "GwjlAkY1fR5j", 152 | "colab_type": "code", 153 | "colab": {} 154 | }, 155 | "source": [ 156 | "import numpy as np\n", 157 | "import random\n", 158 | "\n", 159 | "SEED_MAX = 2**32 - 1\n", 160 | "\n", 161 | "root_rng = np.random.RandomState(seed)\n", 162 | "random.seed(root_rng.randint(SEED_MAX))\n", 163 | "np.random.seed(root_rng.randint(SEED_MAX))" 164 | ], 165 | "execution_count": 0, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "metadata": { 171 | "id": "FIZJmuz8QFn_", 172 | "colab_type": "code", 173 | "colab": {} 174 | }, 175 | "source": [ 176 | "if runtime == \"host\":\n", 177 | "  %cd /content\n", 178 | "  !rm -rf deep-coder\n", 179 | "  ![ ! 
-e deep-coder ] && git clone $repository_url deep-coder\n", 180 | " %cd deep-coder\n", 181 | " !git checkout origin/$branch_name\n", 182 | " !git submodule init\n", 183 | " !git submodule update\n", 184 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 185 | " !curl https://colab.chainer.org/install | sh -\n", 186 | " !pip install tqdm" 187 | ], 188 | "execution_count": 0, 189 | "outputs": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": { 194 | "id": "Oz7sdzxUi70b", 195 | "colab_type": "text" 196 | }, 197 | "source": [ 198 | "### Train DNN Model\n", 199 | "* Create `Trainer`\n", 200 | "* Run training" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "h7kdglcUjDTQ", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "import pickle\n", 212 | "import os\n", 213 | "import chainer as ch\n", 214 | "from chainer import datasets\n", 215 | "from chainer.training import extensions\n", 216 | "from src.dataset import EncodedDataset, Dataset\n", 217 | "import src.train as T\n", 218 | "from src.model import ModelShapeParameters\n", 219 | "\n", 220 | "with open(train_dataset_path, \"rb\") as f:\n", 221 | " d: Dataset = pickle.load(f)\n", 222 | "dataset = d.dataset\n", 223 | "metadata = d.metadata\n", 224 | " \n", 225 | "\n", 226 | "if num_train != 0:\n", 227 | " num_test = int(num_train *\n", 228 | " (ratio_test if ratio_test is not None else 0.0))\n", 229 | " dataset, _ = datasets.split_dataset_random(\n", 230 | " dataset, num_train + num_test, seed=root_rng.randint(SEED_MAX))\n", 231 | "\n", 232 | "model_shape = ModelShapeParameters(metadata, num_hidden_layers, n_embed, n_units)\n", 233 | "\n", 234 | "n_entries = len(dataset)\n", 235 | "dataset = EncodedDataset(Dataset(dataset, metadata))\n", 236 | "if ratio_test is None or ratio_test == 0:\n", 237 | " train = dataset\n", 238 | " test = None\n", 239 | "else:\n", 240 | " train, test = datasets.split_dataset_random(dataset, int(\n", 241 | " n_entries * (1.0 - ratio_test)), seed=root_rng.randint(SEED_MAX))\n", 242 | "\n", 243 | "train_iter = ch.iterators.SerialIterator(train, batch_size)\n", 244 | "if test is not None:\n", 245 | " test_iter = ch.iterators.SerialIterator(\n", 246 | " test, batch_size, repeat=False, shuffle=False)\n", 247 | "else:\n", 248 | " test_iter = None\n", 249 | "\n", 250 | "train = T.Training(train_iter, test_iter, destination_path, model_shape, weight_label_false,\n", 251 | " num_epochs, device=device)\n", 252 | "train.trainer.extend(extensions.LogReport())\n", 253 | "if test_iter is not None:\n", 254 | " train.trainer.extend(extensions.PrintReport(\n", 255 | " ['epoch',\n", 256 | " 'main/loss', 'validation/main/loss',\n", 257 | " 'main/accuracy', 'main/accuracy_false', 'main/accuracy_true',\n", 258 | " 'validation/main/accuracy', 'validation/main/accuracy_false', 'validation/main/accuracy_true',\n", 259 | " 'elapsed_time']))\n", 260 | "else:\n", 261 | " train.trainer.extend(extensions.PrintReport(\n", 262 | " ['epoch', 'main/loss', 'main/accuracy', 'main/accuracy_false', 'main/accuracy_true', 'elapsed_time']))\n" 263 | ], 264 | "execution_count": 0, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "Pl4xN2N2kGfo", 271 | "colab_type": "code", 272 | "colab": {} 273 | }, 274 | "source": [ 275 | "train.trainer.run()" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "id": "1Sl37YHR_b6L", 284 | "colab_type": "text" 
285 | }, 286 | "source": [ 287 | "### Save DNN Model" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "metadata": { 293 | "id": "D-WYlqxVkO5i", 294 | "colab_type": "code", 295 | "colab": {} 296 | }, 297 | "source": [ 298 | "import os\n", 299 | "import chainer as ch\n", 300 | "\n", 301 | "if not os.path.exists(destination_path):\n", 302 | " os.makedirs(destination_path)\n", 303 | "\n", 304 | "with open(os.path.join(destination_path, \"model-shape.pickle\"), \"wb\") as f:\n", 305 | " pickle.dump(model_shape, f)\n", 306 | "\n", 307 | "ch.serializers.save_npz(os.path.join(destination_path, \"model.npz\"), train.predictor)\n" 308 | ], 309 | "execution_count": 0, 310 | "outputs": [] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": { 315 | "colab_type": "text", 316 | "id": "4IOCX_PXG6sH" 317 | }, 318 | "source": [ 319 | "### Validate DNN Model" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "colab_type": "code", 326 | "id": "SsVdGBe4G6sJ", 327 | "colab": {} 328 | }, 329 | "source": [ 330 | "import pickle\n", 331 | "import os\n", 332 | "import chainer as ch\n", 333 | "from chainer import datasets\n", 334 | "from src.dataset import EncodedDataset, Dataset\n", 335 | "import src.inference as I\n", 336 | "from src.model import ModelShapeParameters\n", 337 | "from tqdm import tqdm_notebook as tqdm\n", 338 | "\n", 339 | "model = I.InferenceModel(model_shape)\n", 340 | "ch.serializers.load_npz(os.path.join(destination_path, \"model.npz\"), model.predictor)\n", 341 | "\n", 342 | "with open(valid_dataset_path, \"rb\") as f:\n", 343 | " dataset: Dataset = pickle.load(f)\n", 344 | "\n", 345 | "pred = I.predict_with_neural_network(model_shape, model)\n", 346 | "\n", 347 | "results = dict([])\n", 348 | "num_succ = 0\n", 349 | "for i, (entry,) in enumerate(tqdm(dataset.dataset)):\n", 350 | " result = I.search(\n", 351 | " os.path.join(os.getcwd(), \"DeepCoder_Utils\",\n", 352 | " \"enumerative-search\", \"search\"),\n", 353 | " timeout_second,\n", 354 | " model_shape.dataset_metadata.value_range,\n", 355 | " entry.examples,\n", 356 | " max_program_length,\n", 357 | " pred\n", 358 | " )\n", 359 | " results[i] = result\n", 360 | " if result.is_solved:\n", 361 | " num_succ += 1\n", 362 | "\n", 363 | "print(\"Solved: {} of {} examples\".format(num_succ, len(dataset.dataset)))\n" 364 | ], 365 | "execution_count": 0, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": { 371 | "colab_type": "text", 372 | "id": "Lhq-S-vcGxUQ" 373 | }, 374 | "source": [ 375 | "### Save Validation Result" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "id": "5DuxS691_fuT", 382 | "colab_type": "code", 383 | "colab": {} 384 | }, 385 | "source": [ 386 | "import os\n", 387 | "\n", 388 | "if not os.path.exists(destination_path):\n", 389 | " os.makedirs(destination_path)\n", 390 | "\n", 391 | "with open(os.path.join(destination_path, \"result.pickle\"), \"wb\") as f:\n", 392 | " pickle.dump(results, f)" 393 | ], 394 | "execution_count": 0, 395 | "outputs": [] 396 | } 397 | ] 398 | } -------------------------------------------------------------------------------- /inspect_validation_results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder inspect-validation-results", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 
| "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " drive.mount('/gdrive')\n", 41 | " runtime = \"host\"\n", 42 | "except:\n", 43 | " runtime = \"local\"" 44 | ], 45 | "execution_count": 0, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "_S457sT6QMUr", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "2LYvG4iCQUwh", 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "#@title Parameters\n", 67 | "#@markdown |Name |Description|\n", 68 | "#@markdown |:--- |:---|\n", 69 | "#@markdown |`seed`|The random seed|\n", 70 | "seed = 3984 #@param {type: \"number\"}\n", 71 | "\n", 72 | "#@markdown ### `deep-coder` Repositories\n", 73 | "#@markdown |Name |Description|\n", 74 | "#@markdown |:--- |:---|\n", 75 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 76 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 77 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 78 | "branch_name = \"master\" #@param {type: \"string\"}\n", 79 | "\n", 80 | "#@markdown ### Filepathes\n", 81 | "#@markdown |Name |Description|\n", 82 | "#@markdown |:--- |:---|\n", 83 | "#@markdown |`valid_dataset_path`|The file path of the validation dataset|\n", 84 | "#@markdown |`result_pathes` |The comma separated list of the validation results|\n", 85 | "valid_dataset_path = \"dataset/valid.pickle\" #@param {type: \"string\"}\n", 86 | "result_pathes = \"out/baseline/result.pickle,out/deep-coder/result.pickle\" #@param {type: \"string\"}\n", 87 | "result_path_list = list(result_pathes.split(\",\"))" 88 | ], 89 | "execution_count": 0, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "_BembldCdOO1", 96 | "colab_type": "text" 97 | }, 98 | "source": [ 99 | "### Setup\n", 100 | "* Fix the random seed\n", 101 | "* Download the codebase (when using the host runtime)\n", 102 | " 1. Clone git repository and move to the specified branch\n", 103 | " 2. Initialize submodule\n", 104 | " 3. Build the `search` tool\n", 105 | " 4. 
Install chainer and cupy" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "GwjlAkY1fR5j", 112 | "colab_type": "code", 113 | "colab": {} 114 | }, 115 | "source": [ 116 | "import numpy as np\n", 117 | "import random\n", 118 | "\n", 119 | "SEED_MAX = 2**32 - 1\n", 120 | "\n", 121 | "root_rng = np.random.RandomState(seed)\n", 122 | "random.seed(root_rng.randint(SEED_MAX))\n", 123 | "np.random.seed(root_rng.randint(SEED_MAX))" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "FIZJmuz8QFn_", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | "if runtime == \"host\":\n", 137 | " %cd /content\n", 138 | " !rm -rf deep-coder\n", 139 | " ![ ! -e deep-coder ] && git clone $repository_url deep-coder\n", 140 | " %cd deep-coder\n", 141 | " !git checkout origin/$branch_name\n", 142 | " !git submodule init\n", 143 | " !git submodule update\n", 144 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 145 | " !curl https://colab.chainer.org/install | sh -" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "Oz7sdzxUi70b", 154 | "colab_type": "text" 155 | }, 156 | "source": [ 157 | "### Load Dataset and Results\n", 158 | "* Load dataset\n", 159 | "* Load results" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "h7kdglcUjDTQ", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "import pickle\n", 171 | "\n", 172 | "# Load model\n", 173 | "with open(valid_dataset_path, \"rb\") as f:\n", 174 | " valid_dataset = pickle.load(f)\n", 175 | "\n", 176 | "# Load results\n", 177 | "results = dict()\n", 178 | "for path in result_path_list:\n", 179 | " with open(path, \"rb\") as f:\n", 180 | " results[path] = pickle.load(f)\n" 181 | ], 182 | "execution_count": 0, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": { 188 | "id": "kr3F1N-V5ZCi", 189 | "colab_type": "text" 190 | }, 191 | "source": [ 192 | "### Visualize Results\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "colab_type": "text", 199 | "id": "UFdUihGlRO1A" 200 | }, 201 | "source": [ 202 | "* Show the time needed to solve" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "colab_type": "code", 209 | "id": "sxGwuWQSRO1C", 210 | "colab": {} 211 | }, 212 | "source": [ 213 | "#@markdown ### Visualization Parameters\n", 214 | "#@markdown |Name |Description|\n", 215 | "#@markdown |:--- |:---|\n", 216 | "#@markdown |`width` |The width of the matplotlib plot|\n", 217 | "#@markdown |`height`|The height of the matplotlib plot|\n", 218 | "width = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 219 | "height = 4 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 220 | ], 221 | "execution_count": 0, 222 | "outputs": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "colab_type": "code", 228 | "id": "6A5-TdZKRO1K", 229 | "colab": {} 230 | }, 231 | "source": [ 232 | "import matplotlib.pyplot as plt\n", 233 | "\n", 234 | "plt.figure(figsize=(width, height))\n", 235 | "\n", 236 | "required_times = dict()\n", 237 | "for path in results.keys():\n", 238 | " t = [r.time_seconds for r in results[path].values() if r.is_solved]\n", 239 | " t.sort()\n", 240 | " required_times[path] = t\n", 241 | "\n", 242 | "plt.axis(\"tight\")\n", 243 | 
"plt.axis(\"off\")\n", 244 | "plt.title(\"Search Speed\")\n", 245 | "\n", 246 | "rowLabels = list(results.keys())\n", 247 | "colLabels = [\"20%\", \"40%\", \"60%\", \"80%\", \"100%\"]\n", 248 | "\n", 249 | "text = []\n", 250 | "for path in rowLabels:\n", 251 | " r = 0.0\n", 252 | " t = required_times[path]\n", 253 | " row = []\n", 254 | " for i in range(5):\n", 255 | " r += 0.2\n", 256 | " num = int(len(valid_dataset.dataset) * r)\n", 257 | " row.append(\"{} sec\".format(t[num]) if len(t) > num else \"-\")\n", 258 | " text.append(row)\n", 259 | "\n", 260 | "plt.table(cellText=text, colLabels=colLabels,\n", 261 | " rowLabels=rowLabels, loc=\"center\")" 262 | ], 263 | "execution_count": 0, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "id": "95XYnQQlNtMH", 270 | "colab_type": "text" 271 | }, 272 | "source": [ 273 | "* Show the nodes exploration needed to solve" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "q6Ik8J8MAcbv", 280 | "colab_type": "code", 281 | "colab": {} 282 | }, 283 | "source": [ 284 | "#@markdown ### Visualization Parameters\n", 285 | "#@markdown |Name |Description|\n", 286 | "#@markdown |:--- |:---|\n", 287 | "#@markdown |`width` |The width of the matplotlib plot|\n", 288 | "#@markdown |`height`|The height of the matplotlib plot|\n", 289 | "width = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 290 | "height = 4 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 291 | ], 292 | "execution_count": 0, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "TdtM61G15xJx", 299 | "colab_type": "code", 300 | "colab": {} 301 | }, 302 | "source": [ 303 | "import matplotlib.pyplot as plt\n", 304 | "\n", 305 | "plt.figure(figsize=(width, height))\n", 306 | "\n", 307 | "required_nodes = dict()\n", 308 | "for path in results.keys():\n", 309 | " n = [r.explored_nodes for r in results[path].values() if r.is_solved]\n", 310 | " n.sort()\n", 311 | " required_nodes[path] = n\n", 312 | "\n", 313 | "plt.axis(\"tight\")\n", 314 | "plt.axis(\"off\")\n", 315 | "plt.title(\"Explored Nodes\")\n", 316 | "\n", 317 | "rowLabels = list(results.keys())\n", 318 | "colLabels = [\"20%\", \"40%\", \"60%\", \"80%\", \"100%\"]\n", 319 | "\n", 320 | "text = []\n", 321 | "for path in rowLabels:\n", 322 | " r = 0.0\n", 323 | " t = required_nodes[path]\n", 324 | " row = []\n", 325 | " for i in range(5):\n", 326 | " r += 0.2\n", 327 | " num = int(len(valid_dataset.dataset) * r)\n", 328 | " row.append(\"{}\".format(t[num]) if len(t) > num else \"-\")\n", 329 | " text.append(row)\n", 330 | "\n", 331 | "plt.table(cellText=text, colLabels=colLabels,\n", 332 | " rowLabels=rowLabels, loc=\"center\")" 333 | ], 334 | "execution_count": 0, 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": { 340 | "id": "dGDkEoWPSH4k", 341 | "colab_type": "text" 342 | }, 343 | "source": [ 344 | "* Show the detail of the specified entry" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "colab_type": "code", 351 | "id": "Zw7fHLT9SGcE", 352 | "colab": {} 353 | }, 354 | "source": [ 355 | "#@markdown ### Visualization Parameters\n", 356 | "#@markdown |Name |Description|\n", 357 | "#@markdown |:--- |:---|\n", 358 | "#@markdown |`width` |The width of the matplotlib plot|\n", 359 | "#@markdown |`height`|The height of the matplotlib plot|\n", 360 | "#@markdown |`index` |The index of the specified entry|\n", 361 | "width = 36 #@param {type: 
\"slider\", min: 1, max: 48, step: 1}\n", 362 | "height = 4 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 363 | "index = 0 #@param {type: \"number\"}\n" 364 | ], 365 | "execution_count": 0, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "uuZSF82T9zCg", 372 | "colab_type": "code", 373 | "colab": {} 374 | }, 375 | "source": [ 376 | "from matplotlib import colors\n", 377 | "import matplotlib.cm as cm\n", 378 | "import matplotlib.pyplot as plt\n", 379 | "\n", 380 | "m = cm.ScalarMappable(norm=colors.Normalize(vmin=0, vmax=1), cmap=cm.Greens)\n", 381 | "\n", 382 | "plt.figure(figsize=(width, height))\n", 383 | " \n", 384 | "entry = valid_dataset.dataset[index][0]\n", 385 | "print(\"Valid {}\".format(index))\n", 386 | "print(\"Examples\")\n", 387 | "for i, example in enumerate(entry.examples):\n", 388 | " print(\"Example {}\".format(i))\n", 389 | " for j, input in enumerate(example.inputs):\n", 390 | " print(\" input {}: {}\".format(j, input))\n", 391 | " print(\" output: {}\".format(example.output))\n", 392 | "print()\n", 393 | "\n", 394 | "print(\"Source Code\")\n", 395 | "print(\"Ground Truth\")\n", 396 | "print(entry.source_code)\n", 397 | "print()\n", 398 | "for path, result in results.items():\n", 399 | " print(path)\n", 400 | " print(result[index].solution)\n", 401 | " print()\n", 402 | "\n", 403 | "plt.title(\"Attributes\")\n", 404 | "labels = [\"Ground Truth\"]\n", 405 | "ticks = [0.5]\n", 406 | "data = np.ones(len(entry.attribute))\n", 407 | "colors = []\n", 408 | "for _, v in entry.attribute.items():\n", 409 | " colors.append(m.to_rgba(1 if v else 0))\n", 410 | "xs = np.arange(len(entry.attribute))\n", 411 | "plt.bar(xs, data, width=0.9, bottom=np.zeros(1),\n", 412 | " color=colors, tick_label=list(entry.attribute.keys()))\n", 413 | "\n", 414 | "for i, (path, result) in enumerate(results.items()):\n", 415 | " labels.append(path)\n", 416 | " ticks.append(i + 1.5)\n", 417 | " colors = []\n", 418 | " for key in entry.attribute.keys():\n", 419 | " colors.append(m.to_rgba(result[index].probabilities[key]))\n", 420 | " plt.bar(xs, data, width=0.9, bottom=np.ones(1) * (i + 1), color=colors,\n", 421 | " tick_label=list(entry.attribute.keys()))\n", 422 | "\n", 423 | "plt.yticks(ticks=ticks, labels=labels)\n" 424 | ], 425 | "execution_count": 0, 426 | "outputs": [] 427 | } 428 | ] 429 | } -------------------------------------------------------------------------------- /inspect_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep-coder inspect-dataset", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "try:\n", 39 | " from google.colab import drive\n", 40 | " drive.mount('/gdrive')\n", 41 | " runtime = \"host\"\n", 42 | "except:\n", 43 | " runtime = \"local\"" 44 | ], 45 | "execution_count": 0, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "_S457sT6QMUr", 52 | "colab_type": "text" 53 | }, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "2LYvG4iCQUwh", 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "#@title Parameters\n", 67 | "#@markdown |Name |Description|\n", 68 | "#@markdown |:--- |:---|\n", 69 | "#@markdown |`seed`|The random seed|\n", 70 | "seed = 3984 #@param {type: \"number\"}\n", 71 | "\n", 72 | "#@markdown ### `deep-coder` Repositories\n", 73 | "#@markdown |Name |Description|\n", 74 | "#@markdown |:--- |:---|\n", 75 | "#@markdown |`repository_url`|The URL of `deep-coder` git repository (enabled only in the host runtime)|\n", 76 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 77 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 78 | "branch_name = \"master\" #@param {type: \"string\"}\n", 79 | "\n", 80 | "#@markdown ### Dataset Filepathes\n", 81 | "#@markdown |Name |Description|\n", 82 | "#@markdown |:--- |:---|\n", 83 | "#@markdown |`train_dataset_path`|The file path of the training dataset.|\n", 84 | "#@markdown |`valid_dataset_path`|The file path of the validation dataset.|\n", 85 | "train_dataset_path = \"dataset/train.pickle\" #@param {type: \"string\"}\n", 86 | "valid_dataset_path = \"dataset/valid.pickle\" #@param {type: \"string\"}\n", 87 | "\n" 88 | ], 89 | "execution_count": 0, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "_BembldCdOO1", 96 | "colab_type": "text" 97 | }, 98 | "source": [ 99 | "### Setup\n", 100 | "* Fix the random seed\n", 101 | "* Download the codebase (when using the host runtime)\n", 102 | " 1. Clone git repository and move to the specified branch\n", 103 | " 2. Initialize submodule\n", 104 | " 3. Build the `search` tool\n", 105 | " 4. Install chainer and cupy\n", 106 | "* Define common functions" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "GwjlAkY1fR5j", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "import numpy as np\n", 118 | "import random\n", 119 | "\n", 120 | "SEED_MAX = 2**32 - 1\n", 121 | "\n", 122 | "root_rng = np.random.RandomState(seed)\n", 123 | "random.seed(root_rng.randint(SEED_MAX))\n", 124 | "np.random.seed(root_rng.randint(SEED_MAX))" 125 | ], 126 | "execution_count": 0, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "FIZJmuz8QFn_", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "if runtime == \"host\":\n", 138 | " %cd /content\n", 139 | " !rm -rf deep-coder\n", 140 | " ![ ! 
-e deep-coder ] && git clone $repository_url deep-coder\n", 141 | " %cd deep-coder\n", 142 | " !git checkout origin/$branch_name\n", 143 | " !git submodule init\n", 144 | " !git submodule update\n", 145 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 146 | " !curl https://colab.chainer.org/install | sh -\n", 147 | " !pip install tqdm" 148 | ], 149 | "execution_count": 0, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "kiJYKCy87zwb", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "def is_input(line):\n", 161 | " return 1 if (\"<- int\" in line or \"<- [int]\" in line) else 0" 162 | ], 163 | "execution_count": 0, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "id": "Oz7sdzxUi70b", 170 | "colab_type": "text" 171 | }, 172 | "source": [ 173 | "### Load Datasets\n", 174 | "* Load dataset" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "h7kdglcUjDTQ", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "import pickle\n", 186 | "\n", 187 | "with open(train_dataset_path, \"rb\") as f:\n", 188 | " train_dataset = pickle.load(f)\n", 189 | "\n", 190 | "with open(valid_dataset_path, \"rb\") as f:\n", 191 | " valid_dataset = pickle.load(f)" 192 | ], 193 | "execution_count": 0, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "id": "kr3F1N-V5ZCi", 200 | "colab_type": "text" 201 | }, 202 | "source": [ 203 | "### Visualize Datasets\n", 204 | "* Show the graph of #inputs and #entries in the dataset." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "metadata": { 210 | "id": "q6Ik8J8MAcbv", 211 | "colab_type": "code", 212 | "colab": {} 213 | }, 214 | "source": [ 215 | "#@markdown ### Visualization Parameters\n", 216 | "#@markdown |Name |Description|\n", 217 | "#@markdown |:--- |:---|\n", 218 | "#@markdown |`width` |The width of the matplotlib plot|\n", 219 | "#@markdown |`height`|The height of the matplotlib plot|\n", 220 | "#@markdown ---\n", 221 | "width = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 222 | "height = 4 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 223 | ], 224 | "execution_count": 0, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "metadata": { 230 | "id": "TdtM61G15xJx", 231 | "colab_type": "code", 232 | "colab": {} 233 | }, 234 | "source": [ 235 | "import matplotlib.pyplot as plt\n", 236 | "\n", 237 | "train = [sum(map(is_input, entry.source_code.split(\"\\n\"))) for entry, in train_dataset.dataset]\n", 238 | "valid = [sum(map(is_input, entry.source_code.split(\"\\n\"))) for entry, in valid_dataset.dataset]\n", 239 | "\n", 240 | "plt.figure(figsize=(width, height))\n", 241 | "plt.hist(train)\n", 242 | "plt.hist(valid)\n", 243 | "plt.legend([\"training\", \"validation\"])\n", 244 | "plt.xlabel(\"#Inputs\")\n", 245 | "plt.ylabel(\"#Entries\")" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "z40AzKb079Fd", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "* Show the graph of body-length and #entries in the dataset.\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "colab_type": "code", 264 | "id": "ZYzVa_imA1tN", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "#@markdown ### Visualization Parameters\n", 269 | "#@markdown |Name |Description|\n", 270 | 
"#@markdown |:--- |:---|\n", 271 | "#@markdown |`width` |The width of the matplotlib plot|\n", 272 | "#@markdown |`height`|The height of the matplotlib plot|\n", 273 | "#@markdown ---\n", 274 | "width = 12 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 275 | "height = 4 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "metadata": { 283 | "id": "HSnoJgEP5xAT", 284 | "colab_type": "code", 285 | "colab": {} 286 | }, 287 | "source": [ 288 | "import matplotlib.pyplot as plt\n", 289 | "\n", 290 | "train = [len(entry.source_code.split(\"\\n\")) - sum(map(is_input, entry.source_code.split(\"\\n\"))) for entry, in train_dataset.dataset]\n", 291 | "valid = [len(entry.source_code.split(\"\\n\")) - sum(map(is_input, entry.source_code.split(\"\\n\"))) for entry, in valid_dataset.dataset]\n", 292 | "\n", 293 | "plt.figure(figsize=(width, height))\n", 294 | "plt.hist(train)\n", 295 | "plt.hist(valid)\n", 296 | "plt.legend([\"training\", \"validation\"])\n", 297 | "plt.xlabel(\"length of body\")\n", 298 | "plt.ylabel(\"#Entries\")" 299 | ], 300 | "execution_count": 0, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "eioRCab97_Cx", 307 | "colab_type": "text" 308 | }, 309 | "source": [ 310 | "* Show the prior distribution of the training dataset" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "metadata": { 316 | "colab_type": "code", 317 | "id": "chx-5LtlA37C", 318 | "colab": {} 319 | }, 320 | "source": [ 321 | "#@markdown ### Visualization Parameters\n", 322 | "#@markdown |Name |Description|\n", 323 | "#@markdown |:--- |:---|\n", 324 | "#@markdown |`width` |The width of the matplotlib plot|\n", 325 | "#@markdown |`height`|The height of the matplotlib plot|\n", 326 | "#@markdown ---\n", 327 | "width = 36 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 328 | "height = 8 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n" 329 | ], 330 | "execution_count": 0, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "IBxXAO9G8ABY", 337 | "colab_type": "code", 338 | "colab": {} 339 | }, 340 | "source": [ 341 | "import matplotlib.pyplot as plt\n", 342 | "from src.dataset import prior_distribution\n", 343 | "\n", 344 | "# prior-distribution\n", 345 | "prior = prior_distribution(train_dataset.dataset)\n", 346 | "columns = []\n", 347 | "data = []\n", 348 | "for symbol, prob in prior.items():\n", 349 | " columns.append(symbol)\n", 350 | " data.append(prob)\n", 351 | "data = np.array(data)\n", 352 | "\n", 353 | "# Show plot\n", 354 | "xs = np.arange(len(columns)) + 1\n", 355 | "plt.figure(figsize=(width, height))\n", 356 | "plt.bar(xs, data, width=0.4, bottom=np.zeros(1), tick_label=columns)\n", 357 | "plt.ylabel(\"Probability\")\n", 358 | "plt.xlabel(\"Symbol\")\n", 359 | "plt.title(\"Prior Distribution\")\n" 360 | ], 361 | "execution_count": 0, 362 | "outputs": [] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "AaAfw0Y69yVZ", 368 | "colab_type": "text" 369 | }, 370 | "source": [ 371 | "* Show the detail of the specified entry" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "metadata": { 377 | "id": "c8FppYnoAum1", 378 | "colab_type": "code", 379 | "colab": {} 380 | }, 381 | "source": [ 382 | "#@markdown ### Visualization Parameters\n", 383 | "#@markdown |Name |Description|\n", 384 | "#@markdown |:--- |:---|\n", 385 | "#@markdown |`width` 
|The width of the matplotlib plot|\n", 386 | "#@markdown |`height` |The height of the matplotlib plot|\n", 387 | "#@markdown |`dataset`|The dataset that contains the specified entry|\n", 388 | "#@markdown |`index` |The index of the specified entry|\n", 389 | "#@markdown ---\n", 390 | "width = 36 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 391 | "height = 1 #@param {type: \"slider\", min: 1, max: 48, step: 1}\n", 392 | "dataset = \"train\" #@param [\"train\", \"valid\"]\n", 393 | "index = 0 #@param {type: \"number\"}\n" 394 | ], 395 | "execution_count": 0, 396 | "outputs": [] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "metadata": { 401 | "id": "uuZSF82T9zCg", 402 | "colab_type": "code", 403 | "colab": {} 404 | }, 405 | "source": [ 406 | "from matplotlib import colors\n", 407 | "import matplotlib.cm as cm\n", 408 | "import matplotlib.pyplot as plt\n", 409 | "\n", 410 | "m = cm.ScalarMappable(norm=colors.Normalize(vmin=0, vmax=1), cmap=cm.Greens)\n", 411 | "\n", 412 | "plt.figure(figsize=(width, height))\n", 413 | " \n", 414 | "def show_entry(title, entry):\n", 415 | " print(title)\n", 416 | " print(\"Source Code\")\n", 417 | " print(entry.source_code)\n", 418 | "\n", 419 | " print()\n", 420 | " print(\"Examples\")\n", 421 | " for i, example in enumerate(entry.examples):\n", 422 | " print(\"Example {}\".format(i))\n", 423 | " for j, input in enumerate(example.inputs):\n", 424 | " print(\" input {}: {}\".format(j, input))\n", 425 | " print(\" output: {}\".format(example.output))\n", 426 | "\n", 427 | " plt.title(\"Attributes\")\n", 428 | " plt.gca().yaxis.set_visible(False)\n", 429 | " data = np.ones(len(entry.attribute))\n", 430 | " bar_colors = []\n", 431 | " for _, v in entry.attribute.items():\n", 432 | " bar_colors.append(m.to_rgba(1 if v else 0))\n", 433 | " xs = np.arange(len(entry.attribute)) + 10\n", 434 | " plt.bar(xs, data, width=0.9, bottom=np.zeros(1),\n", 435 | " color=bar_colors, tick_label=list(entry.attribute.keys()))\n", 436 | "\n", 437 | "entry = (train_dataset if dataset == \"train\" else valid_dataset).dataset[index][0]\n", 438 | "show_entry(\"{} {}\".format(dataset, index), entry)\n" 439 | ], 440 | "execution_count": 0, 441 | "outputs": [] 442 | } 443 | ] 444 | } -------------------------------------------------------------------------------- /examples/medium/train_w0_0.25.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "train (length=3, w_0=0.25)", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "yje9hqtcUQ_f", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "### Initialization\n", 26 | "* Check whether the runtime is host or local.\n", 27 | "* Mount Google Drive when using the host runtime."
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "FwqGy_GyUQnw", 34 | "colab_type": "code", 35 | "outputId": "6cec82b6-ab85-4bc9-b1ec-800d3135528e", 36 | "colab": { 37 | "base_uri": "https://localhost:8080/", 38 | "height": 34 39 | } 40 | }, 41 | "source": [ 42 | "try:\n", 43 | " from google.colab import drive\n", 44 | " drive.mount('/gdrive')\n", 45 | " runtime = \"host\"\n", 46 | "except ImportError:\n", 47 | " runtime = \"local\"" 48 | ], 49 | "execution_count": 1, 50 | "outputs": [ 51 | { 52 | "output_type": "stream", 53 | "text": [ 54 | "Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount(\"/gdrive\", force_remount=True).\n" 55 | ], 56 | "name": "stdout" 57 | } 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "id": "_S457sT6QMUr", 64 | "colab_type": "text" 65 | }, 66 | "source": [ 67 | "### Parameters" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "colab_type": "code", 74 | "id": "QN-4eF51DNqt", 75 | "colab": {} 76 | }, 77 | "source": [ 78 | "#@title Parameters\n", 79 | "#@markdown |Name |Description|\n", 80 | "#@markdown |:--- |:---|\n", 81 | "#@markdown |`seed`|The random seed|\n", 82 | "seed = 3984 #@param {type: \"number\"}\n", 83 | "\n", 84 | "#@markdown ### `deep-coder` Repository\n", 85 | "#@markdown |Name |Description|\n", 86 | "#@markdown |:--- |:---|\n", 87 | "#@markdown |`repository_url`|The URL of the `deep-coder` git repository (enabled only in the host runtime)|\n", 88 | "#@markdown |`branch_name` |The branch name (enabled only in the host runtime)|\n", 89 | "repository_url = \"https://github.com/HiroakiMikami/deep-coder\" #@param {type: \"string\"}\n", 90 | "branch_name = \"master\" #@param {type: \"string\"}\n", 91 | "\n", 92 | "#@markdown ### Model Parameters\n", 93 | "#@markdown |Name |Description|\n", 94 | "#@markdown |:--- |:---|\n", 95 | "#@markdown |`n_embed` |The dimension of integer embeddings|\n", 96 | "#@markdown |`n_units` |The number of units in the hidden layers|\n", 97 | "#@markdown |`num_hidden_layers`|The number of hidden layers|\n", 98 | "n_embed = 20 #@param {type: \"number\"}\n", 99 | "n_units = 256 #@param {type: \"number\"}\n", 100 | "num_hidden_layers = 3 #@param {type: \"number\"}\n", 101 | "\n", 102 | "#@markdown ### Training Settings\n", 103 | "#@markdown |Name |Description|\n", 104 | "#@markdown |:--- |:---|\n", 105 | "#@markdown |`batch_size` |The minibatch size|\n", 106 | "#@markdown |`weight_label_false`|The weight for the loss value in the case of attribute=False.
`-1` means that the original loss function is used|\n", 107 | "#@markdown |`num_epochs` |The number of epochs|\n", 108 | "#@markdown |`ratio_test` |The ratio of entries for testing|\n", 109 | "#@markdown |`num_train` |The number of entries used for training|\n", 110 | "batch_size = 32 #@param {type: \"number\"}\n", 111 | "weight_label_false = 0.25 #@param {type: \"number\"}\n", 112 | "num_epochs = 10 #@param {type: \"number\"}\n", 113 | "ratio_test = 0 #@param {type: \"number\"}\n", 114 | "num_train = 0 #@param {type: \"number\"}\n", 115 | "\n", 116 | "#@markdown ### Validation Settings\n", 117 | "#@markdown |Name |Description|\n", 118 | "#@markdown |:--- |:---|\n", 119 | "#@markdown |`timeout_second` |The timeout of the program search (in seconds)|\n", 120 | "#@markdown |`max_program_length`|The maximum length of the program|\n", 121 | "timeout_second = 1 #@param {type: \"number\"}\n", 122 | "max_program_length = 3 #@param {type: \"number\"}\n", 123 | "\n", 124 | "#@markdown ### Other Settings\n", 125 | "#@markdown |Name |Description|\n", 126 | "#@markdown |:--- |:---|\n", 127 | "#@markdown |`device`|The ID of the GPU. `-1` means that the CPU is used.|\n", 128 | "device = 0 #@param {type: \"number\"}\n", 129 | "\n", 130 | "#@markdown ### Filepath\n", 131 | "#@markdown |Name |Description|\n", 132 | "#@markdown |:--- |:---|\n", 133 | "#@markdown |`train_dataset_path`|The file path of the training dataset.|\n", 134 | "#@markdown |`valid_dataset_path`|The file path of the validation dataset.|\n", 135 | "#@markdown |`destination_path` |The path of the directory that will contain the training results.|\n", 136 | "train_dataset_path = \"/gdrive/My Drive/DeepCoder/dataset/length_3/train.pickle\" #@param {type: \"string\"}\n", 137 | "valid_dataset_path = \"/gdrive/My Drive/DeepCoder/dataset/length_3/valid.pickle\" #@param {type: \"string\"}\n", 138 | "destination_path = \"/gdrive/My Drive/DeepCoder/out/length_3/w0_0.25\" #@param {type: \"string\"}\n", 139 | "\n" 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "_BembldCdOO1", 148 | "colab_type": "text" 149 | }, 150 | "source": [ 151 | "### Setup\n", 152 | "* Fix the random seed\n", 153 | "* Download the codebase\n", 154 | " 1. Clone the git repository and check out the specified branch\n", 155 | " 2. Initialize the submodule\n", 156 | " 3. Install chainer and cupy\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "metadata": { 162 | "id": "GwjlAkY1fR5j", 163 | "colab_type": "code", 164 | "colab": {} 165 | }, 166 | "source": [ 167 | "import numpy as np\n", 168 | "import random\n", 169 | "\n", 170 | "SEED_MAX = 2**32 - 1\n", 171 | "\n", 172 | "root_rng = np.random.RandomState(seed)\n", 173 | "random.seed(root_rng.randint(SEED_MAX))\n", 174 | "np.random.seed(root_rng.randint(SEED_MAX))" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "FIZJmuz8QFn_", 183 | "colab_type": "code", 184 | "outputId": "e5487737-ab4c-4a7c-cca4-57570203580b", 185 | "colab": { 186 | "base_uri": "https://localhost:8080/", 187 | "height": 833 188 | } 189 | }, 190 | "source": [ 191 | "if runtime == \"host\":\n", 192 | " %cd /content\n", 193 | " !rm -rf deep-coder\n", 194 | " ![ !
-e deep-coder ] && git clone $repository_url deep-coder\n", 195 | " %cd deep-coder\n", 196 | " !git checkout origin/$branch_name\n", 197 | " !git submodule init\n", 198 | " !git submodule update\n", 199 | " !make -C DeepCoder_Utils/enumerative-search -j `nproc`\n", 200 | " !curl https://colab.chainer.org/install | sh -\n", 201 | " !pip install tqdm" 202 | ], 203 | "execution_count": 4, 204 | "outputs": [ 205 | { 206 | "output_type": "stream", 207 | "text": [ 208 | "/content\n", 209 | "Cloning into 'deep-coder'...\n", 210 | "remote: Enumerating objects: 139, done.\u001b[K\n", 211 | "remote: Counting objects: 100% (139/139), done.\u001b[K\n", 212 | "remote: Compressing objects: 100% (104/104), done.\u001b[K\n", 213 | "remote: Total 1263 (delta 80), reused 64 (delta 34), pack-reused 1124\u001b[K\n", 214 | "Receiving objects: 100% (1263/1263), 17.84 MiB | 14.27 MiB/s, done.\n", 215 | "Resolving deltas: 100% (758/758), done.\n", 216 | "/content/deep-coder\n", 217 | "Note: checking out 'origin/master'.\n", 218 | "\n", 219 | "You are in 'detached HEAD' state. You can look around, make experimental\n", 220 | "changes and commit them, and you can discard any commits you make in this\n", 221 | "state without impacting any branches by performing another checkout.\n", 222 | "\n", 223 | "If you want to create a new branch to retain commits you create, you may\n", 224 | "do so (now or later) by using -b with the checkout command again. Example:\n", 225 | "\n", 226 | " git checkout -b <new-branch-name>\n", 227 | "\n", 228 | "HEAD is now at cbf6f06 Add example script to generate the dataset\n", 229 | "Submodule 'DeepCoder_Utils' (https://github.com/HiroakiMikami/DeepCoder-Utils.git) registered for path 'DeepCoder_Utils'\n", 230 | "Cloning into '/content/deep-coder/DeepCoder_Utils'...\n", 231 | "Submodule path 'DeepCoder_Utils': checked out '10330caf96b2f6bf354c512010b356a7b0d1dba5'\n", 232 | "make: Entering directory '/content/deep-coder/DeepCoder_Utils/enumerative-search'\n", 233 | "g++ -std=c++11 -O3 successor.cc -c -o successor.o\n", 234 | "g++ -std=c++11 -O3 ops.cc -c -o ops.o\n", 235 | "g++ -std=c++11 -O3 program_state.cc -c -o program_state.o\n", 236 | "g++ -std=c++11 -O3 main.cc -c -o main.o\n", 237 | "g++ -std=c++11 -O3 datum.cc -c -o datum.o\n", 238 | "g++ -std=c++11 -O3 depth_first_search.cc -c -o depth_first_search.o\n", 239 | "g++ -std=c++11 -O3 utils.cc -c -o utils.o\n", 240 | "g++ -std=c++11 -O3 io_set.cc -c -o io_set.o\n", 241 | "g++ -std=c++11 -O3 successor.o ops.o program_state.o main.o datum.o depth_first_search.o utils.o io_set.o -o search\n", 242 | "make: Leaving directory '/content/deep-coder/DeepCoder_Utils/enumerative-search'\n", 243 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 244 | " Dload Upload Total Spent Left Speed\n", 245 | "100 1580 100 1580 0 0 10463 0 --:--:-- --:--:-- --:--:-- 10463\n", 246 | "+ apt -y -q install cuda-libraries-dev-10-0\n", 247 | "Reading package lists...\n", 248 | "Building dependency tree...\n", 249 | "Reading state information...\n", 250 | "cuda-libraries-dev-10-0 is already the newest version (10.0.130-1).\n", 251 | "0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.\n", 252 | "+ pip install -q cupy-cuda100 chainer \n", 253 | "+ set +ex\n", 254 | "Installation succeeded!\n", 255 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (4.28.1)\n" 256 | ], 257 | "name": "stdout" 258 | } 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "id": "Oz7sdzxUi70b", 265 |
"colab_type": "text" 266 | }, 267 | "source": [ 268 | "### Train DNN Model\n", 269 | "* Create `Trainer`\n", 270 | "* Run training" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "metadata": { 276 | "id": "h7kdglcUjDTQ", 277 | "colab_type": "code", 278 | "colab": {} 279 | }, 280 | "source": [ 281 | "import pickle\n", 282 | "import os\n", 283 | "import chainer as ch\n", 284 | "from chainer import datasets\n", 285 | "from chainer.training import extensions\n", 286 | "from src.dataset import EncodedDataset, Dataset\n", 287 | "import src.train as T\n", 288 | "from src.model import ModelShapeParameters\n", 289 | "\n", 290 | "with open(train_dataset_path, \"rb\") as f:\n", 291 | " d: Dataset = pickle.load(f)\n", 292 | "dataset = d.dataset\n", 293 | "metadata = d.metadata\n", 294 | " \n", 295 | "\n", 296 | "if num_train != 0:\n", 297 | " num_test = int(num_train *\n", 298 | " (ratio_test if ratio_test is not None else 0.0))\n", 299 | " dataset, _ = datasets.split_dataset_random(\n", 300 | " dataset, num_train + num_test, seed=root_rng.randint(SEED_MAX))\n", 301 | "\n", 302 | "model_shape = ModelShapeParameters(metadata, num_hidden_layers, n_embed, n_units)\n", 303 | "\n", 304 | "n_entries = len(dataset)\n", 305 | "dataset = EncodedDataset(Dataset(dataset, metadata))\n", 306 | "if ratio_test is None or ratio_test == 0:\n", 307 | " train = dataset\n", 308 | " test = None\n", 309 | "else:\n", 310 | " train, test = datasets.split_dataset_random(dataset, int(\n", 311 | " n_entries * (1.0 - ratio_test)), seed=root_rng.randint(SEED_MAX))\n", 312 | "\n", 313 | "train_iter = ch.iterators.SerialIterator(train, batch_size)\n", 314 | "if test is not None:\n", 315 | " test_iter = ch.iterators.SerialIterator(\n", 316 | " test, batch_size, repeat=False, shuffle=False)\n", 317 | "else:\n", 318 | " test_iter = None\n", 319 | "\n", 320 | "train = T.Training(train_iter, test_iter, destination_path, model_shape, weight_label_false,\n", 321 | " num_epochs, device=device)\n", 322 | "train.trainer.extend(extensions.LogReport())\n", 323 | "if test_iter is not None:\n", 324 | " train.trainer.extend(extensions.PrintReport(\n", 325 | " ['epoch',\n", 326 | " 'main/loss', 'validation/main/loss',\n", 327 | " 'main/accuracy', 'main/accuracy_false', 'main/accuracy_true',\n", 328 | " 'validation/main/accuracy', 'validation/main/accuracy_false', 'validation/main/accuracy_true',\n", 329 | " 'elapsed_time']))\n", 330 | "else:\n", 331 | " train.trainer.extend(extensions.PrintReport(\n", 332 | " ['epoch', 'main/loss', 'main/accuracy', 'main/accuracy_false', 'main/accuracy_true', 'elapsed_time']))\n" 333 | ], 334 | "execution_count": 0, 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "metadata": { 340 | "id": "Pl4xN2N2kGfo", 341 | "colab_type": "code", 342 | "outputId": "551911a7-53bb-4050-d21b-cc0d6198bf28", 343 | "colab": { 344 | "base_uri": "https://localhost:8080/", 345 | "height": 204 346 | } 347 | }, 348 | "source": [ 349 | "train.trainer.run()" 350 | ], 351 | "execution_count": 6, 352 | "outputs": [ 353 | { 354 | "output_type": "stream", 355 | "text": [ 356 | "epoch main/loss main/accuracy main/accuracy_false main/accuracy_true elapsed_time\n", 357 | "\u001b[J1 0.34842 0.582146 0.523196 0.955804 19.7328 \n", 358 | "\u001b[J2 0.298922 0.655387 0.606752 0.963585 39.0438 \n", 359 | "\u001b[J3 0.276125 0.688846 0.645138 0.965862 58.6429 \n", 360 | "\u001b[J4 0.256822 0.716636 0.676875 0.968595 78.1293 \n", 361 | "\u001b[J5 0.2425 0.734074 0.696553 0.971824 97.7923 \n", 362 | "\u001b[J6 0.230753 
0.749231 0.713646 0.974687 117.432 \n", 363 | "\u001b[J7 0.219334 0.763054 0.729179 0.977694 137.323 \n", 364 | "\u001b[J8 0.208591 0.776069 0.743796 0.980534 156.764 \n", 365 | "\u001b[J9 0.198056 0.789123 0.758456 0.983368 176.322 \n", 366 | "\u001b[J10 0.188058 0.801361 0.772229 0.985918 195.787 \n" 367 | ], 368 | "name": "stdout" 369 | } 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "id": "1Sl37YHR_b6L", 376 | "colab_type": "text" 377 | }, 378 | "source": [ 379 | "### Save DNN Model" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "D-WYlqxVkO5i", 386 | "colab_type": "code", 387 | "colab": {} 388 | }, 389 | "source": [ 390 | "import os\n", 391 | "import chainer as ch\n", 392 | "\n", 393 | "if not os.path.exists(destination_path):\n", 394 | " os.makedirs(destination_path)\n", 395 | "\n", 396 | "with open(os.path.join(destination_path, \"model-shape.pickle\"), \"wb\") as f:\n", 397 | " pickle.dump(model_shape, f)\n", 398 | "\n", 399 | "ch.serializers.save_npz(os.path.join(destination_path, \"model.npz\"), train.predictor)\n" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "colab_type": "text", 408 | "id": "4IOCX_PXG6sH" 409 | }, 410 | "source": [ 411 | "### Validate DNN Model" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "metadata": { 417 | "colab_type": "code", 418 | "id": "SsVdGBe4G6sJ", 419 | "outputId": "f94e0edb-737c-4b7a-a84a-220a7bc833bb", 420 | "colab": { 421 | "base_uri": "https://localhost:8080/", 422 | "height": 51 423 | } 424 | }, 425 | "source": [ 426 | "import pickle\n", 427 | "import os\n", 428 | "import chainer as ch\n", 429 | "from chainer import datasets\n", 430 | "from src.dataset import EncodedDataset, Dataset\n", 431 | "import src.inference as I\n", 432 | "from src.model import ModelShapeParameters\n", 433 | "from tqdm import tqdm_notebook as tqdm\n", 434 | "\n", 435 | "model = I.InferenceModel(model_shape)\n", 436 | "ch.serializers.load_npz(os.path.join(destination_path, \"model.npz\"), model.predictor)\n", 437 | "\n", 438 | "with open(valid_dataset_path, \"rb\") as f:\n", 439 | " dataset: Dataset = pickle.load(f)\n", 440 | "\n", 441 | "pred = I.predict_with_neural_network(model_shape, model)\n", 442 | "\n", 443 | "results = dict([])\n", 444 | "num_succ = 0\n", 445 | "for i, (entry,) in enumerate(tqdm(dataset.dataset)):\n", 446 | " result = I.search(\n", 447 | " os.path.join(os.getcwd(), \"DeepCoder_Utils\",\n", 448 | " \"enumerative-search\", \"search\"),\n", 449 | " timeout_second,\n", 450 | " model_shape.dataset_metadata.value_range,\n", 451 | " entry.examples,\n", 452 | " max_program_length,\n", 453 | " pred\n", 454 | " )\n", 455 | " results[i] = result\n", 456 | " if result.is_solved:\n", 457 | " num_succ += 1\n", 458 | "\n", 459 | "print(\"Solved: {} of {} examples\".format(num_succ, len(dataset.dataset)))\n" 460 | ], 461 | "execution_count": 8, 462 | "outputs": [ 463 | { 464 | "output_type": "display_data", 465 | "data": { 466 | "application/vnd.jupyter.widget-view+json": { 467 | "model_id": "be152486f1274241933e65d1eaf8cd45", 468 | "version_minor": 0, 469 | "version_major": 2 470 | }, 471 | "text/plain": [ 472 | "HBox(children=(IntProgress(value=0, max=500), HTML(value='')))" 473 | ] 474 | }, 475 | "metadata": { 476 | "tags": [] 477 | } 478 | }, 479 | { 480 | "output_type": "stream", 481 | "text": [ 482 | "\n", 483 | "Solved: 470 of 500 examples\n" 484 | ], 485 | "name": "stdout" 486 | } 487 | 
] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": { 492 | "colab_type": "text", 493 | "id": "Lhq-S-vcGxUQ" 494 | }, 495 | "source": [ 496 | "### Save Validation Result" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "metadata": { 502 | "id": "5DuxS691_fuT", 503 | "colab_type": "code", 504 | "colab": {} 505 | }, 506 | "source": [ 507 | "import os\n", 508 | "\n", 509 | "if not os.path.exists(destination_path):\n", 510 | " os.makedirs(destination_path)\n", 511 | "\n", 512 | "with open(os.path.join(destination_path, \"result.pickle\"), \"wb\") as f:\n", 513 | " pickle.dump(results, f)" 514 | ], 515 | "execution_count": 0, 516 | "outputs": [] 517 | } 518 | ] 519 | } --------------------------------------------------------------------------------
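A note on `prior_distribution`, which the inspection notebook above imports from `src.dataset`: its implementation is not part of this excerpt. The sketch below is only a guess at what it plausibly computes — the empirical probability that each attribute (DSL function) is used by an entry — which is consistent with how the notebook plots its result; the `_sketch` suffix marks the function as hypothetical, while the `(entry,)` 1-tuples and the boolean `attribute` dict mirror the notebook's own usage.

from collections import defaultdict

def prior_distribution_sketch(dataset):
    # `dataset` yields 1-tuples (entry,); entry.attribute maps each DSL
    # symbol to a boolean saying whether the entry's program uses it.
    counts = defaultdict(int)
    num_entries = 0
    for entry, in dataset:
        num_entries += 1
        for symbol, used in entry.attribute.items():
            if used:
                counts[symbol] += 1
    # Empirical probability of attribute=True for each symbol.
    return {symbol: count / num_entries for symbol, count in counts.items()}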
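The three model parameters in the training notebook (`n_embed`, `n_units`, `num_hidden_layers`) map onto a DeepCoder-style attribute predictor: integers in the I/O examples are embedded into `n_embed` dimensions, each encoded example passes through `num_hidden_layers` fully connected layers of `n_units` units, and the per-example features are pooled before the attribute head. The repository's actual `Predictor` lives in `src/model.py`, which is not shown in this dump, so the Chainer sketch below is only an illustration under those assumptions; the class name, the `value_range` default, and the mean-pooling choice are guesses, not the repository's code.

import chainer as ch
import chainer.functions as F
import chainer.links as L

class PredictorSketch(ch.Chain):
    def __init__(self, n_embed, n_units, num_hidden_layers, num_attributes,
                 value_range=512):
        super().__init__()
        with self.init_scope():
            self.embed = L.EmbedID(value_range, n_embed)
            self.layers = ch.ChainList(
                *[L.Linear(None, n_units) for _ in range(num_hidden_layers)])
            self.output = L.Linear(None, num_attributes)

    def forward(self, values):
        # values: int32 array of shape (batch, num_examples, num_values)
        batch, num_examples = values.shape[:2]
        h = self.embed(values)                        # append an n_embed axis
        h = F.reshape(h, (batch * num_examples, -1))  # flatten each example
        for layer in self.layers:
            h = F.relu(layer(h))
        h = F.reshape(h, (batch, num_examples, -1))
        h = F.mean(h, axis=1)                         # pool over the examples
        return self.output(h)                         # one logit per attribute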
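`weight_label_false` (the `w_0` in the notebook's title) is handed to `src.train.Training`, whose definition is cut off near the top of this dump. A plausible reading, sketched below, is a weighted sigmoid cross-entropy that scales the loss on the abundant attribute=False labels by `w_0`, so the rarer attribute=True labels dominate the gradient; the function name and the exact weighting scheme are assumptions, not the repository's code. With `w_0 = 1.0` this reduces to the ordinary unweighted loss.

import numpy as np
import chainer.functions as F

def weighted_sigmoid_cross_entropy(logits, labels, w_0):
    # logits: chainer.Variable of shape (batch, num_attributes)
    # labels: int32 ndarray of 0/1 attribute labels with the same shape
    elementwise = F.sigmoid_cross_entropy(logits, labels, reduce='no')
    # Down-weight the loss at attribute=False positions by w_0 (e.g. 0.25).
    weight = np.where(labels == 0, w_0, 1.0).astype(np.float32)
    return F.mean(elementwise * weight)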
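Finally, the validation loop drives the compiled `DeepCoder_Utils/enumerative-search/search` binary through `src.inference.search` with a `timeout_second` budget per task. The binary's command-line protocol is not visible in this dump, so the sketch below only illustrates the generic pattern it relies on — run an external solver under a wall-clock budget and count a timeout as an unsolved task; `problem_text` stands in for whatever encoding of the examples and predicted attribute priors the binary actually expects.

import subprocess

def run_with_budget(binary_path, problem_text, timeout_second):
    try:
        proc = subprocess.run(
            [binary_path], input=problem_text,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            universal_newlines=True, timeout=timeout_second)
        return proc.stdout   # the found program, if any, is parsed from here
    except subprocess.TimeoutExpired:
        return None          # budget exhausted: counted as unsolved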