├── LICENSE ├── README.md ├── django ├── all.anno └── all.code └── src ├── extract_words.py └── parse.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2015 Yusuke Oda and AHCLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Django Dataset for Code Translation Tasks 2 | ========================================= 3 | 4 | This repository contains the *Django* dataset used in the paper 5 | [*"Learning to Generate Pseudo-Code from Source Code Using Statistical Machine Translation"*](http://ieeexplore.ieee.org/document/7372045/), 6 | Oda et al., ASE, 2015. 7 | 8 | Contents 9 | -------- 10 | 11 | * Directory **django**: contains the raw data of the Django dataset. 
#!/usr/bin/env python3
#
# usage:
#   ./extract_words.py < ast > leaves

import re
import sys

# Parentheses are padded with spaces so they always become separate tokens.
_PAREN = re.compile(r'([()])')


def _leaves(line):
    """Return the leaf tokens of one S-expression line.

    A token is a leaf when it is not a parenthesis and is not the node label
    that directly follows an opening parenthesis.  The very first token of the
    line is never emitted, because it has no predecessor to inspect.
    """
    tokens = _PAREN.sub(r' \1 ', line).split()
    return [
        tok
        for prev, tok in zip(tokens, tokens[1:])
        if prev != '(' and tok not in ('(', ')')
    ]


def main():
    """Read S-expressions from stdin and print each line's leaves.

    One output line is written per input line (an empty input line yields an
    empty output line), with the leaves joined by single spaces.
    """
    for line in sys.stdin:
        print(' '.join(_leaves(line)))


if __name__ == '__main__':
    main()
import ast
import inspect
import re
import sys

# Single-pass replacement table used by escape().  None of the replacement
# strings contains a character that is itself a key, so one translate() call
# gives the same result as replacing the characters one after another.
_ESCAPES = str.maketrans({
    '"': '`',
    '\'': '`',
    ' ': '-SP-',
    '\t': '-TAB-',
    '\n': '-NL-',
    '(': '-LRB-',
    ')': '-RRB-',
    '|': '-BAR-',
})

# Leading keyword of a clause that cannot be parsed on its own.  The trailing
# \b keeps identifiers such as `try_count` or `exception` from matching.
_CLAUSE = re.compile(r'^(elif|else|try|except|finally)\b')


def typename(x):
    """Return the name of the type of *x*."""
    return type(x).__name__


def escape(text):
    """Return *text* as a single token that is safe inside an S-expression.

    Quotes become backticks; spaces, tabs, newlines, parentheses and bars are
    replaced by -SP-, -TAB-, -NL-, -LRB-, -RRB- and -BAR-.  Whatever remains
    is passed through repr() (without the surrounding quotes) so backslashes
    and other special characters are escaped.  An empty string yields
    '-NONE-'.
    """
    text = text.translate(_ESCAPES)
    return repr(text)[1:-1] if text else '-NONE-'


def makestr(node):
    """Render *node* (an AST node, list or leaf value) as an S-expression.

    * ast.AST  -> '(Type (field child) ...)'; a node for which no field
                  renders becomes '(Type -Type-)'.
    * list     -> '(list child ...)', or '' when nothing in it renders.
    * str      -> '(str <escaped text>)'
    * bytes    -> '(bytes <escaped str() of the value>)'
    * other    -> '(<type name> <str() of the value>)'

    None and Pass nodes are deliberately not dropped: None goes through the
    generic branch and is rendered as '(NoneType None)'.
    """
    if isinstance(node, ast.AST):
        nodename = typename(node)
        parts = []
        for chname, chval in ast.iter_fields(node):
            chstr = makestr(chval)
            if chstr:
                parts.append('(' + chname + ' ' + chstr + ')')
        if not parts:
            parts.append('-' + nodename + '-')  # (Foo) -> (Foo -Foo-)
        return '(' + nodename + ' ' + ' '.join(parts) + ')'

    if isinstance(node, list):
        parts = [chstr for chstr in map(makestr, node) if chstr]
        return '(list ' + ' '.join(parts) + ')' if parts else ''

    if isinstance(node, str):
        return '(str ' + escape(node) + ')'

    if isinstance(node, bytes):
        return '(bytes ' + escape(str(node)) + ')'

    return '(' + typename(node) + ' ' + str(node) + ')'


def _complete_fragment(line):
    """Turn one stripped, non-empty source line into a parseable snippet.

    Clauses that cannot stand alone get the minimal scaffolding they need:
    elif/else are preceded by a dummy `if`, except/finally by a dummy `try`,
    a `try` is followed by a dummy `except`, and a decorator by a dummy
    function definition.  A line that ends with ':' receives a `pass` body.
    """
    clause = _CLAUSE.match(line)
    keyword = clause.group(1) if clause else None

    if keyword in ('elif', 'else'):
        line = 'if True: pass\n' + line
    elif keyword == 'try':
        # Only add a body when the line has none; `try: stmt` keeps its own.
        if line.endswith(':'):
            line += 'pass'
        line += '\nexcept: pass'
    elif keyword in ('except', 'finally'):
        line = 'try: pass\n' + line
    elif line.startswith('@'):
        line += '\ndef dummy(): pass'

    if line.endswith(':'):
        line += 'pass'
    return line


def main():
    """Parse each stdin line as Python and print its AST as an S-expression.

    Input and output lines correspond one to one: a blank input line produces
    a blank output line.  Only the first statement of each completed snippet
    is dumped.  Output is flushed after every line.  A line that is still not
    valid Python after completion makes ast.parse raise SyntaxError.
    """
    for raw in sys.stdin:
        line = raw.strip()
        if not line:
            print()
            sys.stdout.flush()
            continue

        tree = ast.parse(_complete_fragment(line))
        print(makestr(tree.body[0]))
        sys.stdout.flush()


if __name__ == '__main__':
    main()