├── .gitignore
├── LICENSE
├── README.md
├── code_diff
    ├── __init__.py
    ├── ast.py
    ├── diff_utils.py
    ├── gumtree
    │   ├── __init__.py
    │   ├── chawathe.py
    │   ├── editmap.py
    │   ├── isomap.py
    │   ├── ops.py
    │   └── utils.py
    ├── sstubs.py
    └── utils.py
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    └── test_sstubs.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | #VSCode
132 | .vscode/
133 | scripts/
134 | 
135 | .DS_Store
136 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2021-2022 Cedric Richter
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Code Diff
  2 | ------------------------------------------------
  3 | > Fast AST based code differencing in Python
  4 | 
  5 | Software projects are constantly evolving to integrate new features or improve existing implementations. To keep track of this progress, it becomes important to track individual code changes. Code differencing provides a way
  6 | to identify the smallest code change between two
  7 | implementations. 
  8 | 
  9 | **code.diff** provides a fast alternative to standard code differencing techniques with a focus
 10 | on AST based code differencing. As part of this library, we include a fast reimplementation of the [**GumTree**](https://github.com/GumTreeDiff/gumtree) algorithm. However, by relying on
 11 | a best-effort AST parser, we are able to generate
 12 | AST code changes for individual code snippets. Many
 13 | programming languages including Python, Java and JavaScript are supported!
 14 | 
 15 | 
 16 | ## Installation
 17 | The package is tested under Python 3. It can be installed via:
 18 | ```
 19 | pip install code-diff
 20 | ```
 21 | 
 22 | ## Usage
 23 | code.diff can compute a code difference for nearly any program code in a few lines of code:
 24 | ```python
 25 | import code_diff as cd
 26 | 
 27 | # Python
 28 | output = cd.difference(
 29 |     '''
 30 |         def my_func():
 31 |             print("Hello World")
 32 |     ''',
 33 |     '''
 34 |         def say_helloworld():
 35 |             print("Hello World")
 36 |     ''',
 37 | lang = "python")
 38 | 
 39 | # Output: my_func -> say_helloworld
 40 | 
 41 | output.edit_script()
 42 | 
 43 | # Output: 
 44 | # [
 45 | #  Update((identifier:my_func, line 1:12 - 1:19), say_helloworld)
 46 | #]
 47 | 
 48 | 
 49 | # Java
 50 | output = cd.difference(
 51 |     '''
 52 |         int x = x + 1;
 53 |     ''',
 54 |     '''
 55 |         int x = x / 2;
 56 |     ''',
 57 | lang = "java")
 58 | 
 59 | # Output: x + 1 -> x / 2
 60 | 
 61 | output.edit_script()
 62 | 
 63 | # Output: [
 64 | #  Insert(/:/, (binary_operator, line 0:4 - 0:9), 1),
 65 | #  Update((integer:1, line 0:8 - 0:9), 2),
 66 | #  Delete((+:+, line 0:6 - 0:7))
 67 | #]
 68 | 
 69 | 
 70 | ```
 71 | ## Language support
 72 | code.diff supports most programming languages
 73 | where an AST can be computed. To parse an AST,
 74 | the underlying parser employs
 75 | * [**code.tokenize:**](https://github.com/cedricrupb/code_tokenize) A frontend for 
 76 | tree-sitter to effectively parse and tokenize 
 77 | program code in Python.
 78 | 
 79 | * [**tree-sitter:**](https://tree-sitter.github.io/tree-sitter/) A best-effort AST parser supporting
 80 | many programming languages including Python, Java and JavaScript.
 81 | 
 82 | To decide whether your code can be handled by code.diff please review the libraries above.
 83 | 
 84 | **GumTree:** To compute an edit script between a source and target AST, we employ a Python reimplementation of the [GumTree](https://github.com/GumTreeDiff/gumtree) algorithm. Note, however, that the computed scripts are heavily dependent on the AST representation of the given code. Therefore, an AST edit script computed with code.diff might differ significantly from the one computed by GumTree.
 85 | 
 86 | 
 87 | ## Release history
 88 | * 0.1.2
 89 |     * Fix of the release information
 90 |     * Fix bug in 0.1.1 release
 91 |     * Package now useable by installing from PyPI
 92 | * 0.1.0
 93 |     * Initial functionality
 94 |     * Documentation
 95 |     * SStuB Testing
 96 | 
 97 | ## Project Info
 98 | The goal of this project is to provide developers with easy access to AST-based code differencing. It is currently developed as a helper library for internal research projects. Therefore, it will only be updated as needed.
 99 | 
100 | Feel free to open an issue if anything unexpected
101 | happens. 
102 | 
103 | [Cedric Richter](https://uol.de/informatik/formale-methoden/team/cedric-richter) - [@cedricrichter](https://twitter.com/cedrichter) - cedric.richter@uni-oldenburg.de
104 | 
105 | Distributed under the MIT license. See ``LICENSE`` for more information.
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/code_diff/__init__.py:
--------------------------------------------------------------------------------
  1 | from code_tokenize.lang import load_from_lang_config
  2 | from code_tokenize.tokens import match_type
  3 | 
  4 | from .ast     import parse_ast
  5 | from .utils   import cached_property
  6 | from .sstubs  import SStubPattern, classify_sstub
  7 | from .gumtree import compute_edit_script, EditScript, Update
  8 | 
  9 | 
 10 | # Main method --------------------------------------------------------
 11 | 
 12 | def difference(source, target, lang = "guess", **kwargs):
 13 |     """
 14 |     Computes the smallest difference between source and target
 15 | 
 16 |     Computes the smallest code difference between the given 
 17 |     code snippets. Difference is computed by a simulteanous
 18 |     walk over the ASTs of the given code snippets. Returned
 19 |     will be the smallest code snippet that represent
 20 |     the first AST node found to be different.
 21 | 
 22 |     Parameters
 23 |     ----------
 24 |     source : str
 25 |         Source code which should be compared
 26 |     
 27 |     target : str
 28 |         Comparison target as a code string
 29 | 
 30 |     lang : [python, java, javascript, ...]
 31 |         Programming language which should be used
 32 |         to parse the code snippets.
 33 |         Default: guess (Currently not supported, will throw error)
 34 |     
 35 |     syntax_error : [raise, warn, ignore]
 36 |         Strategy to handle syntax errors in code.
 37 |         To parse incomplete code snippets, 'ignore' should
 38 |         be selected to silent any warning.
 39 |         Default: raise (Raises an exception)
 40 | 
 41 |     **kwargs : dict
 42 |         Further config option that are specific to
 43 |         the underlying AST parser. See code_tokenize
 44 |         for more infos.
 45 | 
 46 |     Returns
 47 |     -------
 48 |     ASTDiff
 49 |         The smallest code change necessary
 50 |         to convert the source code into the target code.
 51 |     
 52 |     """
 53 |     
 54 |     config     = load_from_lang_config(lang, **kwargs)
 55 |     source_ast = parse_ast(source, lang = lang, **kwargs)
 56 |     target_ast = parse_ast(target, lang = lang, **kwargs)
 57 | 
 58 |     if source_ast is None or target_ast is None:
 59 |         raise ValueError("Source / Target AST seems to be empty: %s" % source)
 60 | 
 61 |     # Concretize Diff
 62 |     source_ast, target_ast = diff_search(source_ast, target_ast)
 63 | 
 64 |     if source_ast is None:
 65 |         raise ValueError("Source and Target AST are identical.")
 66 | 
 67 |     return ASTDiff(config, source_ast, target_ast)
 68 | 
 69 | 
 70 | # Diff Search --------------------------------------------------------
 71 | # Run BFS until we find a node with at least two diffs
 72 | 
 73 | def diff_search(source_ast, target_ast):
 74 |     if source_ast is None or source_ast.isomorph(target_ast): return None, None
 75 | 
 76 |     queue = [(source_ast, target_ast)]
 77 |     while len(queue) > 0:
 78 |         source_node, target_node = queue.pop(0)
 79 | 
 80 |         if len(source_node.children) != len(target_node.children):
 81 |             return (source_node, target_node)
 82 |         
 83 |         next_children = []
 84 |         for i, source_child in enumerate(source_node.children):
 85 |             target_child = target_node.children[i]
 86 | 
 87 |             if not source_child.isomorph(target_child): 
 88 |                 next_children.append((source_child, target_child))
 89 |         
 90 |         if len(next_children) == 1:
 91 |             queue.append(next_children[0])
 92 |         else:
 93 |             return (source_node, target_node)
 94 | 
 95 | 
 96 | # AST Difference --------------------------------------------------------
 97 | 
 98 | class ASTDiff:
 99 |     """
100 |     Difference between two code snippets
101 | 
102 |     This object represents the smallest code change
103 |     necessary to transform a source code snippet
104 |     into a target code.
105 | 
106 |     Attributes
107 |     ----------
108 |     is_single_statement : bool
109 |         Whether the code difference only affect a single program statement
110 | 
111 |     source_ast : ASTNode
112 |         AST node related to the code change
113 |     
114 |     source_text : str
115 |         Source code which have to be changed
116 | 
117 |     target_ast : ASTNode
118 |         AST node which is different to the source AST
119 | 
120 |     target_text : str
121 |         Target text for converting source to target
122 |     
123 |     Methods
124 |     -------
125 |     edit_script : list[EditOp]
126 |         Computes a sequence of AST operations which need
127 |         to be performed to translate source code in target code
128 |         
129 |         Note: We balance performance and precision by computing
130 |         the AST edit script at the current diff level. The
131 |         algorithm runs the fastest on the smallest diff level
132 |         but is also most imprecise. To achieve the highest precision,
133 |         the root_diff should be used.
134 | 
135 |     sstub_pattern : SStuBPattern
136 |         Categorizes the current diff into one of 20 SStuB categories.
137 |         Note: Currently, this operation is only supported for
138 |         Python code. Running the function on code in another language
139 |         will cause an exception.
140 | 
141 |     statement_diff : ASTDiff
142 |         raises the AST difference to the statement level
143 |     
144 |     root_diff : ASTDiff
145 |         raises the AST difference to the root level (of each code snippet)
146 | 
147 |     
148 |     """
149 | 
150 |     def __init__(self, config, source_ast, target_ast):
151 |         self.config     = config
152 |         self.source_ast = source_ast
153 |         self.target_ast = target_ast
154 | 
155 |     @cached_property
156 |     def is_single_statement(self):
157 |         return (is_single_statement(self.config.statement_types, self.source_ast)
158 |                     and is_single_statement(self.config.statement_types, self.target_ast))
159 | 
160 |     @cached_property
161 |     def source_text(self):
162 |         return tokenize_tree(self.source_ast)
163 | 
164 |     @cached_property
165 |     def target_text(self):
166 |         return tokenize_tree(self.target_ast)
167 | 
168 |     def statement_diff(self):
169 |         source_stmt = parent_statement(self.config.statement_types, self.source_ast)
170 |         target_stmt = parent_statement(self.config.statement_types, self.target_ast)
171 | 
172 |         if source_stmt is None or target_stmt is None: 
173 |             raise ValueError("AST diff is not enclosed in a statement")
174 |         
175 |         return ASTDiff(self.config, source_stmt, target_stmt)
176 | 
177 |     def root_diff(self):
178 |         return ASTDiff(self.config, ast_root(self.source_ast), ast_root(self.target_ast))
179 | 
180 |     def sstub_pattern(self):
181 |         if self.config.lang != "python":
182 |             raise ValueError("SStuB can currently only be computed for Python code.")
183 |         
184 |         if (parent_statement(self.config.statement_types, self.source_ast) is None
185 |                 or parent_statement(self.config.statement_types, self.target_ast) is None):
186 |             return SStubPattern.NO_STMT                
187 | 
188 |         if not self.is_single_statement:
189 |             return SStubPattern.MULTI_STMT
190 |         
191 |         return classify_sstub(*diff_search(self.source_ast, self.target_ast))
192 | 
193 |     def edit_script(self):
194 | 
195 |         source_ast, target_ast = self.source_ast, self.target_ast
196 | 
197 |         if source_ast.type == target_ast.type and len(source_ast.children) == 0 and len(target_ast.children) == 0:
198 |             # Both nodes are tokens of the same type 
199 |             # Only an update is required
200 |             return EditScript([Update(source_ast, target_ast.text)])
201 | 
202 |         # We need a common root to add to
203 |         while source_ast.type != target_ast.type: 
204 |             if source_ast.parent is None: break
205 |             if target_ast.parent is None: break
206 | 
207 |             source_ast = source_ast.parent
208 |             target_ast = target_ast.parent
209 | 
210 |         return compute_edit_script(source_ast, target_ast)
211 | 
212 |     def __repr__(self):
213 |         return "%s -> %s" % (self.source_text, self.target_text)
214 | 
215 |     
216 | 
217 | 
218 | # AST Utils -----------------------------------------------------------
219 | 
def is_single_statement(statement_types, ast):
    """
    Check that `ast` lies inside a statement and contains no nested one

    Returns False if neither `ast` nor any of its ancestors matches a
    statement type, or if any proper descendant of `ast` matches one.
    """
    if parent_statement(statement_types, ast) is None:
        return False

    # Breadth-first scan over all proper descendants of ast
    pending = list(ast.children)
    while pending:
        node = pending.pop(0)

        if any(match_type(rule, node.type) for rule in statement_types):
            return False

        pending.extend(node.children)

    return True
236 | 
237 | 
def parent_statement(statement_types, ast):
    """
    Find the closest statement node enclosing `ast`

    Walks from `ast` (inclusive) up the parent chain and returns the
    first node whose type matches one of `statement_types`, or None
    if the chain ends without a match.
    """
    candidate = ast

    while candidate is not None:
        if any(match_type(rule, candidate.type) for rule in statement_types):
            return candidate
        candidate = candidate.parent

    return None
249 | 
250 | 
def ast_root(ast):
    """Follow the parent links of `ast` and return the topmost node."""
    node = ast

    while True:
        parent = node.parent
        if parent is None:
            return node
        node = parent
258 | 
259 | 
def tokenize_tree(ast):
    """
    Render the subtree below `ast` as a space-separated token string

    The node's own text (if non-empty) comes first, followed by the
    recursively rendered children in order.
    """
    parts = [ast.text] if ast.text else []
    parts.extend(tokenize_tree(child) for child in ast.children)

    return " ".join(parts)
270 | 
271 | 
272 | 
def is_compatible_root(root_candidate, source_ast):
    """
    Decide whether `root_candidate` may serve as a root for `source_ast`

    A candidate is rejected if equal_text() reports that it covers the
    same text as `source_ast`, or if it is a "block" node.
    """
    return not (equal_text(source_ast, root_candidate) or root_candidate.type == "block")
275 | 
276 | 
def equal_text(source_ast, parent_ast):
    """
    Compare the position ranges of two nodes

    Each position is indexed as ((start_0, start_1), (end_0, end_1)).
    Returns False if the parent starts before the source or ends after
    it in the first coordinate; otherwise returns whether the second
    coordinates of start and end agree.
    """
    source_start, source_end = source_ast.position[0], source_ast.position[1]
    parent_start, parent_end = parent_ast.position[0], parent_ast.position[1]

    starts_earlier = parent_start[0] < source_start[0]
    if starts_earlier:
        return False

    ends_later = source_end[0] < parent_end[0]
    if ends_later:
        return False

    return (source_start[1], source_end[1]) == (parent_start[1], parent_end[1])


--------------------------------------------------------------------------------
/code_diff/ast.py:
--------------------------------------------------------------------------------
  1 | import code_tokenize as ct
  2 | 
  3 | from collections import defaultdict
  4 | 
  5 | # AST Node ----------------------------------------------------------------
  6 | 
  7 | 
  8 | class ASTNode(object):
  9 |     """
 10 |     A representation of an AST node together with its children
 11 | 
 12 |     Node Attributes
 13 |     ---------------
 14 |     type : str
 15 |         Syntactic type of the AST node
 16 | 
 17 |     text : str
 18 |         If this node belongs to a program token, then
 19 |         it contains the text of the program token. Otherwise, None.
 20 |     
 21 |     children : list[ASTNode]
 22 |         Potenially empty list of child nodes
 23 |     
 24 |     position : int
 25 |         If supported, the code position that is referenced by the AST node
 26 | 
 27 |     parent : ASTNode
 28 |         If not root node, the AST parent of this node.
 29 |     
 30 |     Subtree Attributes
 31 |     ------------------
 32 |     subtree_hash : str
 33 |         A hash string representing the subtree of the AST node
 34 |         Two subtrees are isomorph if they have the same subtree hash.
 35 |     
 36 |     subtree_height : int
 37 |         Longest path from this node to a leaf node
 38 | 
 39 |     subtree_weight : int
 40 |         Count of all nodes in this subtree
 41 |     
 42 |     """
 43 | 
 44 |     def __init__(self, type, text = None, position = None, parent = None, children = None):
 45 | 
 46 |         # Basic node attributes
 47 |         self.type = type
 48 |         self.children = children if children is not None else []
 49 |         self.parent   = parent
 50 |         self.text     = text   # If text is not None, then leaf node
 51 |         self.position = position
 52 | 
 53 |         # Tree based attributes
 54 |         self.subtree_hash      = None
 55 |         self.subtree_height    = 1
 56 |         self.subtree_weight    = 1
 57 | 
 58 |     def isomorph(self, other):
 59 |         return ((self.subtree_hash, self.type, self.subtree_height, self.subtree_weight) == 
 60 |                     (other.subtree_hash, other.type, other.subtree_height, other.subtree_weight))
 61 | 
 62 |     def descandents(self):
 63 |         return (t for t in self if t != self) 
 64 | 
 65 |     def sexp(self):
 66 |         name = self.text if self.text is not None else self.type
 67 | 
 68 |         child_sexp = []
 69 |         for child in self.children:
 70 |             text = child.sexp()
 71 |             text = ["  " + t for t in text.splitlines()]
 72 |             child_sexp.append("\n".join(text))
 73 |         
 74 |         if len(child_sexp) == 0:
 75 |             return name
 76 | 
 77 |         return "%s {\n%s\n}" % (name, " ".join(child_sexp))
 78 |         
 79 |     def __iter__(self):
 80 |         def _self_bfs_search():
 81 |             queue = [self]
 82 |             while len(queue) > 0:
 83 |                 current = queue.pop(0)
 84 |                 yield current
 85 |                 queue.extend(current.children)
 86 | 
 87 |         return _self_bfs_search()
 88 | 
 89 |     def __repr__(self):
 90 |         attrs = {"type": self.type, "text": self.text}
 91 |         return "ASTNode(%s)" % (", ".join(["%s=%s" % (k, v) for k, v in attrs.items() if v is not None]))
 92 | 
 93 | 
 94 | def default_create_node(type, children, text = None, position = None):
 95 |     new_node = ASTNode(type, text = text, position = position, children = children)
 96 | 
 97 |     # Subtree metrics
 98 |     height = 1
 99 |     weight = 1
100 |     hash_str = []
101 | 
102 |     for child in children:
103 |         child.parent = new_node # Set parent relation
104 |         height       = max(child.subtree_height + 1, height)
105 |         weight      += child.subtree_weight
106 |         hash_str.append(str(child.subtree_hash))
107 |     
108 |     new_node.subtree_height = height
109 |     new_node.subtree_weight = weight
110 | 
111 |     # WL hash subtree representation
112 |     base_str = new_node.type if new_node.text is None else new_node.text
113 |     hash_str.insert(0, base_str)
114 |     hash_str = "_".join(hash_str)
115 |     new_node.subtree_hash = hash(hash_str)
116 | 
117 |     return new_node
118 | 
119 | 
def _node_key(node):
    """Return the (type, start_point, end_point) triple used to index `node`."""
    key = (node.type, node.start_point, node.end_point)
    return key
122 | 
123 | 
class TokensToAST:
    """
    Rebuilds an AST bottom-up from a token sequence

    Tokens that expose an `ast_node` attribute are turned into nodes
    first; parents are created once all of their children have been
    counted. Nodes are indexed by their (type, start, end) key.

    Parameters
    ----------
    create_node_fn : callable
        Factory called as create_node_fn(type, children, text = ..., position = ...)
        that returns the node object to place into the tree.
    """

    def __init__(self, create_node_fn):
        self.create_node_fn = create_node_fn

        self.root_node = None
        self.waitlist = []                   # Backend nodes scheduled for creation
        self.node_index = {}                 # Node key -> created node
        self.child_count = defaultdict(int)  # Parent key -> number of created children

    def _create_node(self, ast_node, text = None):

        if ast_node.type == "comment": return # We ignore comments

        node_key = _node_key(ast_node)
        children = [self.node_index[_node_key(c)] for c in ast_node.children
                     if _node_key(c) in self.node_index]

        position = (ast_node.start_point, ast_node.end_point)
        current_node = self.create_node_fn(ast_node.type, children, text = text, position = position)
        current_node.backend = ast_node

        self.node_index[node_key] = current_node

        # Add parent if ready
        if ast_node.parent:
            parent_ast = ast_node.parent
            parent_key = _node_key(parent_ast)
            self.child_count[parent_key] += 1

            # Schedule the parent once every child has been created
            if len(parent_ast.children) == self.child_count[parent_key]:
                self.waitlist.append(parent_ast)

        else:
            self.root_node = current_node


    def _open_node(self, node):
        node_key = _node_key(node)
        if node_key in self.node_index: return False

        opened = False
        for c in node.children:
            # Short-circuits: stops descending once one child was opened
            opened = opened or self._open_node(c)

        if not opened:
            self.waitlist.append(node)
            return True

        return False

    def _open_root_if_not_complete(self, base_node):

        root = base_node
        while root.parent is not None:
            root = root.parent

        for c in root.children:
            self._open_node(c)

    def __call__(self, tokens):
        """
        Build the tree for `tokens` and return its root node

        Returns None if no node could be scheduled at all (for example
        an empty token sequence); previously this case crashed with an
        UnboundLocalError.
        """

        token_nodes = ((t.text, t.ast_node) for t in tokens if hasattr(t, "ast_node"))
        for token_text, token_ast in token_nodes:
            self._create_node(token_ast, text = token_text)

        # Nothing created and nothing pending: there is no tree to complete
        if self.root_node is None and len(self.waitlist) == 0:
            return None

        while self.root_node is None:
            while len(self.waitlist) > 0:
                current_node = self.waitlist.pop(0)
                self._create_node(current_node)

            self._open_root_if_not_complete(current_node)

        # Note: a leftover debug print of the full tree was removed here;
        # a parser should not write to stdout on every call.
        return self.root_node
200 | 
201 | 
202 | 
class BottomUpParser:
    """
    Builds an AST bottom-up from a token sequence

    Token nodes are created first. Every other backend node is put on a
    waitlist as soon as all of its (non-comment) children have been
    created, so children always exist before their parent is built.

    Parameters
    ----------
    create_node_fn : callable
        Factory called as create_node_fn(type, children, text = ..., position = ...)
        that returns the node object to place into the tree.
    """

    def __init__(self, create_node_fn):

        self.create_node_fn = create_node_fn
        self.waitlist    = [] # Invariant: All children have been processed
        self.open_index  = {} # Nodes that are scheduled but not yet processed
        self.node_index  = {} # Nodes that have been processed

    def _should_ignore(self, node):
        return node.type == "comment"

    def _add_to_waitlist(self, node):
        if self._should_ignore(node): return

        node_key = _node_key(node)

        # Schedule each node at most once
        if node_key not in self.node_index and node_key not in self.open_index:
            self.open_index[node_key] = node
            self.waitlist.append(node)


    def _init_lists(self, tokens):
        """
        Create all token nodes and schedule the remaining open nodes

        Returns the backend root node, or None if no token carries
        an `ast_node`.
        """

        # Must be bound before the loop: tokens may be empty or may
        # contain no token with an AST node at all.
        ast_node = None

        for token in tokens:
            if hasattr(token, 'ast_node'):
                ast_node = token.ast_node
                if self._should_ignore(ast_node): continue
                self.open_index[_node_key(ast_node)] = ast_node
                self._create_node(ast_node, token.text)

        if ast_node is None: return None

        # Get to root
        root = ast_node
        while root.parent is not None:
            root = root.parent

        self._open_descandents(root)

        return root

    def _open_descandents(self, node):

        queue = [node]
        while len(queue) > 0:
            current_node = queue.pop(0)

            has_opened = False
            for child in current_node.children:
                if _node_key(child) not in self.node_index:
                    has_opened = True
                    queue.append(child)

            # No unprocessed child left below: this node can be built
            if not has_opened:
                self._add_to_waitlist(current_node)


    def _open_parent(self, ast_node):
        parent = ast_node.parent

        if all(_node_key(c) in self.node_index for c in parent.children if not self._should_ignore(c)):
            self._add_to_waitlist(parent)

    def _create_node(self, ast_node, text = None):

        node_key = _node_key(ast_node)
        children = [self.node_index[_node_key(c)] for c in ast_node.children
                     if _node_key(c) in self.node_index]

        position = (ast_node.start_point, ast_node.end_point)
        current_node = self.create_node_fn(ast_node.type, children, text = text, position = position)
        current_node.backend = ast_node

        self.node_index[node_key] = current_node
        del self.open_index[node_key]

        if ast_node.parent: self._open_parent(ast_node)


    def __call__(self, tokens):
        """
        Build the tree for `tokens`

        Returns the created root node, or None if the token sequence
        yields no AST node or the root could not be completed.
        """
        root_node = self._init_lists(tokens)

        # Without any AST-backed token there is nothing to build
        if root_node is None:
            return None

        while len(self.waitlist) > 0:
            self._create_node(self.waitlist.pop(0))

        if _node_key(root_node) not in self.node_index:
            return None

        return self.node_index[_node_key(root_node)]
293 | 
294 | 
295 |     
296 | 
297 | # Interface ----------------------------------------------------------------
298 | 
299 | 
def parse_ast(source_code, lang = "guess", **kwargs):
    """
    Parse a source code string into its AST

    The code is tokenized with code_tokenize and the resulting tokens
    are assembled into an annotated AST by BottomUpParser using
    default_create_node.

    Parameters
    ----------
    source_code : str
        Source code snippet as a string

    lang : [python, java, javascript, ...]
        Language to parse the given source code
        Default: guess (Currently not supported; will raise error)

    **kwargs : dict
        Further options forwarded to code_tokenize's tokenize.
        Note that `syntax_error` is always overridden with "ignore".

    Returns
    -------
    ASTNode
        the root node of the computed AST
        (None if the parser could not produce a root)

    """

    # Explicit settings take precedence over anything passed in kwargs
    tokenizer_options = dict(kwargs)
    tokenizer_options["lang"] = lang
    tokenizer_options["syntax_error"] = "ignore"

    ast_tokens = ct.tokenize(source_code, **tokenizer_options)

    build_tree = BottomUpParser(default_create_node)
    return build_tree(ast_tokens)


--------------------------------------------------------------------------------
/code_diff/diff_utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | 
  4 | # Diff parsing -----------------------------------------------------------------
  5 | 
  6 | class Hunk:
  7 |     
  8 |     def __init__(self, lines, added_lines, rm_lines, header = None):
  9 |         self.lines       = lines
 10 |         self.added_lines = set(added_lines)
 11 |         self.rm_lines    = set(rm_lines)
 12 |         self.header      = header
 13 |         
 14 |         
 15 |     @property
 16 |     def after(self):
 17 |         
 18 |         alines = []
 19 |         
 20 |         for i, line in enumerate(self.lines):
 21 |             if i in self.rm_lines: continue
 22 |             if i in self.added_lines:
 23 |                 alines.append(" " + line[1:])
 24 |             else:
 25 |                 alines.append(line)
 26 |                 
 27 |         return "".join(alines)
 28 |         
 29 |         
 30 |     @property
 31 |     def before(self):
 32 |         
 33 |         alines = []
 34 |         
 35 |         for i, line in enumerate(self.lines):
 36 |             if i in self.added_lines: continue
 37 |             if i in self.rm_lines:
 38 |                 alines.append(" " + line[1:])
 39 |             else:
 40 |                 alines.append(line)
 41 |                 
 42 |         return "".join(alines)
 43 |         
 44 |     def __repr__(self):
 45 | 
 46 |         if self.header:
 47 |             return self.header + "".join(self.lines)
 48 | 
 49 |         return "".join(self.lines)
 50 | 
 51 |     
 52 | def _parse_hunk(lines, start, end):
 53 |     
 54 |     hunk_lines = lines[start + 1:end]
 55 |      
 56 |     added_lines = []
 57 |     rm_lines    = []
 58 |     
 59 |     for i, hline in enumerate(hunk_lines):
 60 |         if hline.startswith("+"): added_lines.append(i)
 61 |         if hline.startswith("-"): rm_lines.append(i)
 62 |     
 63 |     return Hunk(hunk_lines, added_lines, rm_lines, header = lines[start])
 64 |     
 65 | 
 66 | hunk_pat = re.compile("@@ -(\d+)(,\d+)? \+(\d+)(,\d+)? @@.*")
 67 |         
 68 | def parse_hunks(diff):
 69 |     lines = diff.splitlines(True)
 70 |     
 71 |     hunks = []
 72 |     
 73 |     start_ix = -1
 74 |     end_ix   = -1
 75 |     
 76 |     for line_ix, line in enumerate(lines):
 77 |         
 78 |         if hunk_pat.match(line):
 79 |             
 80 |             end_ix = line_ix - 1
 81 |             
 82 |             if start_ix >= 0 and start_ix < end_ix: 
 83 |                 hunks.append(_parse_hunk(lines, start_ix, end_ix))
 84 |             
 85 |             start_ix = line_ix
 86 |     
 87 |     end_ix = len(lines)
 88 |     
 89 |     if start_ix >= 0 and start_ix < end_ix: 
 90 |         hunks.append(_parse_hunk(lines, start_ix, end_ix))
 91 |                 
 92 |     return hunks
 93 | 
 94 | 
 95 | # Diff cleaning --------------------------------
 96 | 
 97 | def _has_incomplete_comment(lines):
 98 |     is_incomplete2 = False
 99 |     is_incomplete1 = False
100 | 
101 |     for line in lines:
102 |         count2 = line.count("\"\"\"")
103 |         if count2 % 2 == 1: is_incomplete2 = not is_incomplete2
104 |         
105 |         count1 = line.count("\'\'\'")
106 |         if count1 % 2 == 1: is_incomplete1 = not is_incomplete1
107 | 
108 |     return is_incomplete1 or is_incomplete2
109 | 
110 | 
111 | def _determine_incomplete_comment(lines):
112 |     last_incomplete2 = -1
113 |     last_incomplete1 = -1
114 | 
115 |     for i, line in enumerate(lines):
116 |         count2 = line.count("\"\"\"")
117 |         if count2 % 2 == 1:
118 |             last_incomplete2 = i if last_incomplete2 == -1 else -1
119 |         
120 |         count1 = line.count("\'\'\'")
121 |         if count1 % 2 == 1:
122 |             last_incomplete1 = i if last_incomplete1 == -1 else -1
123 | 
124 |     assert last_incomplete1 != -1 or last_incomplete2 != -1
125 | 
126 |     last_incomplete = last_incomplete2 if last_incomplete2 != -1 else last_incomplete1
127 | 
128 |     dist_to_end = len(lines) - last_incomplete
129 | 
130 |     if last_incomplete < dist_to_end:
131 |         return last_incomplete + 1, len(lines)
132 |     else:
133 |         return 0, last_incomplete
134 | 
135 | 
def clean_hunk(hunk):
    """
    Cut a hunk down to the part outside an unbalanced triple-quoted block

    Hunks without an unbalanced triple quote are returned unchanged.
    Otherwise a new Hunk is built from the kept line range, with the
    added / removed indices shifted to the new line numbering.
    """
    if not _has_incomplete_comment(hunk.lines):
        return hunk

    lo, hi = _determine_incomplete_comment(hunk.lines)

    def _shifted(indices):
        # Keep only indices inside the range, relative to its start
        return [ix - lo for ix in indices if lo <= ix < hi]

    return Hunk(
        hunk.lines[lo:hi],
        _shifted(hunk.added_lines),
        _shifted(hunk.rm_lines),
        header = hunk.header,
    )
145 | 
146 | 


--------------------------------------------------------------------------------
/code_diff/gumtree/__init__.py:
--------------------------------------------------------------------------------
 1 | from .isomap   import gumtree_isomap
 2 | from .editmap  import gumtree_editmap
 3 | from .chawathe import compute_chawathe_edit_script
 4 | from .ops      import (Update, Insert, Delete, Move)
 5 | from .ops      import EditScript
 6 | from .ops      import serialize_script, deserialize_script
 7 | from .ops      import json_serialize, json_deserialize
 8 | 
 9 | # Edit script ----------------------------------------------------------------
10 | 
def compute_edit_script(source_ast, target_ast, min_height = 1, max_size = 1000, min_dice = 0.5):
    """
    Compute an edit script that transforms source_ast into target_ast

    If both trees are single leaves, the script is one Update of the
    source leaf. Otherwise an isomorphic mapping is computed (lowering
    min_height step by step while the mapping is empty), extended to
    an edit mapping and turned into an edit script.

    Returns
    -------
    EditScript
        The list of edit operations
    """

    # If source_ast and target_ast are only leaves
    both_leaves = len(source_ast.children) == 0 and len(target_ast.children) == 0
    if both_leaves:
        leaf_update = _update_leaf(source_ast, target_ast)
        return EditScript([leaf_update])

    mapping = gumtree_isomap(source_ast, target_ast, min_height)

    # Relax the height bound until something could be mapped
    while len(mapping) == 0 and min_height > 0:
        min_height -= 1
        mapping = gumtree_isomap(source_ast, target_ast, min_height)

    mapping = gumtree_editmap(mapping, source_ast, target_ast, max_size, min_dice)
    operations = compute_chawathe_edit_script(mapping, source_ast, target_ast)

    return EditScript(operations)
27 | 
28 |     
29 | # Update leaf ----------------------------------------------------------------
30 | 
def _update_leaf(source_ast, target_ast):
    """Create the Update that sets the source leaf to the target leaf's text."""
    new_text = target_ast.text
    return Update(source_ast, new_text)


--------------------------------------------------------------------------------
/code_diff/gumtree/chawathe.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | 
  3 | from .ops   import Update, Insert, Delete, Move
  4 | from .utils import bfs_traversal, postorder_traversal
  5 | 
  6 | # API method ----------------------------------------------------------------
  7 | 
  8 | def compute_chawathe_edit_script(editmap, source, target):
  9 | 
 10 |     edit_script = []
 11 | 
 12 |     source_root, source_parent = _fake_root(source)
 13 |     target_root, target_parent = _fake_root(target)
 14 |     editmap.add(source_root, target_root)
 15 | 
 16 |     wt = WorkingTree(editmap)
 17 |     wt[source].mod_parent = wt[source_root] # Inject fake root only for working copy
 18 | 
 19 |     for target_node in bfs_traversal(target): 
 20 | 
 21 |         if target_node == target: # Script might start in the middle of AST
 22 |             parent = target_root
 23 |         else:
 24 |             parent = target_node.parent
 25 | 
 26 |         if parent is None: parent = target_root
 27 | 
 28 |         source_partner = wt.partner(target_node)
 29 |         parent_partner = wt.partner(parent)
 30 | 
 31 |         if source_partner is None:
 32 |             k = wt.position(target_node)
 33 |             op = Insert(
 34 |                     parent_partner.delegate,
 35 |                     (target_node.type, target_node.text),
 36 |                     k,
 37 |                     -1
 38 |                 )
 39 |             edit_script.append(op)
 40 |             node = parent_partner.apply(op)
 41 |             editmap.add(node, target_node)
 42 |         
 43 |         elif target_node.parent is not None:
 44 | 
 45 |             if target_node.text is not None and source_partner.text != target_node.text:
 46 |                 op = Update(source_partner.delegate, target_node.text)
 47 |                 edit_script.append(op)
 48 |                 source_partner.apply(op)
 49 |             
 50 |             partner_parent = source_partner.parent
 51 | 
 52 |             if not editmap[partner_parent.delegate, parent]:
 53 |                 k = wt.position(target_node)
 54 |                 op = Move(
 55 |                     parent_partner.delegate,
 56 |                     source_partner.delegate,
 57 |                     k
 58 |                 )
 59 |                 edit_script.append(op)
 60 |                 parent_partner.apply(op)
 61 |         
 62 |         target_node.inorder = True
 63 |         for move in _align_children(wt.partner(target_node), target_node, wt):
 64 |             edit_script.append(move)
 65 | 
 66 |     for node in postorder_traversal(source):
 67 |         node = wt[node]
 68 |         partner = node.partner
 69 |         if partner is None:
 70 |             op = Delete(node.delegate)
 71 |             edit_script.append(op)
 72 |             node.apply(op)
 73 | 
 74 |     # Change root back after edit
 75 |     source.parent = source_parent
 76 |     target.parent = target_parent
 77 | 
 78 |     return edit_script
 79 | 
 80 | 
 81 | # Alignment ------------------------------------------------------------------
 82 | 
 83 | def _longest_common_subsequence(source, target, equal_fn):
 84 | 
 85 |     lengths = [[0] * (len(target)+1) for _ in range(len(source)+1)]
 86 |     for i, x in enumerate(source):
 87 |         for j, y in enumerate(target):
 88 |             if equal_fn(x, y):
 89 |                 lengths[i+1][j+1] = lengths[i][j] + 1
 90 |             else:
 91 |                 lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])
 92 | 
 93 |     result = []
 94 | 
 95 |     # Backtrack
 96 |     i, j = len(source), len(target)
 97 |     while i > 0 and j > 0:
 98 |         if equal_fn(source[i - 1], target[j - 1]):
 99 |             result.append((source[i - 1], target[j - 1]))
100 |             i -= 1
101 |             j -= 1
102 |         else:
103 |             if lengths[i][j - 1] > lengths[i - 1][j]:
104 |                 j -= 1
105 |             elif lengths[i][j - 1] == lengths[i - 1][j]:
106 |                 # Heuristic we like to select terminal nodes for LCS
107 | 
108 |                 if source[i - 1].text is None:
109 |                     i -= 1
110 |                 else:
111 |                     j -= 1
112 | 
113 |             else:
114 |                 i -= 1
115 |  
116 |     return result[::-1]
117 | 
118 | 
def _align_children(source, target, wt):
    """
    Yield the Move operations that bring the children of source in
    the order of the children of target

    source is a working node, target its partner in the target tree.
    Children whose partner is a child of the other node are compared;
    those on a longest common subsequence are marked in order, every
    other mapped pair is moved within source.

    The generator applies each Move to the working tree after it has
    been yielded and sets the `inorder` attribute on the children.
    """
    for c in source.children: c.inorder = False
    for c in target.children: c.inorder = False

    def _partner_child(c, o, src_partner = False):
        # Is the partner of c a child of o?
        p = wt.partner(c) if src_partner else c.partner
        if p is None: return False
        return p.parent == o

    S1 = [c for c in source.children if _partner_child(c, target)]
    S2 = [c for c in target.children if _partner_child(c, source, True)]

    S = _longest_common_subsequence(S1, S2, lambda x, y: wt.isomap[x.delegate, y])

    SM = set()

    for a, b in S:
        a.inorder = True
        b.inorder = True
        SM.add((a, b))

    for a, b in itertools.product(S1, S2):
        if wt.isomap[a.delegate, b] and (a, b) not in SM:
            k = wt.position(b)
            # Move(target_node, node, position): the destination parent
            # comes first, the moved child second. The child `a` is
            # repositioned below `source` (not the other way round).
            op = Move(source.delegate, a.delegate, k)
            yield op
            source.apply(op)
            a.inorder = True
            b.inorder = True
148 | 
149 | # Working tree ----------------------------------------------------------------
150 | # A tree to capture all AST modifications during edit
151 | 
class InsertNode:
    """
    Node created during the edit (not part of the original AST)

    Every instance receives a consecutive node_id taken from the
    class-wide counter INSERT_COUNT.
    """

    INSERT_COUNT = 0

    def __init__(self, type, text = None, children = None):
        self.type = type
        self.text = text

        # Draw the next id from the class-wide counter
        self.node_id = InsertNode.INSERT_COUNT
        InsertNode.INSERT_COUNT += 1

        self.parent = None
        self.children = [] if children is None else children

    def __repr__(self):
        # Only attributes that are set are shown
        shown = [
            (label, value)
            for label, value in (("type", self.type), ("text", self.text))
            if value is not None
        ]
        return "IN(" + ", ".join("%s=%s" % pair for pair in shown) + ")"
169 | 
170 | 
def _fake_root(root):
    """
    Hang root below a new parentless InsertNode of type "root"

    Returns the new node together with the previous parent of root,
    so that the caller can restore it later.
    """
    previous_parent = root.parent

    placeholder = InsertNode("root", None, [root])
    placeholder.parent = None

    root.parent = placeholder

    return placeholder, previous_parent
179 | 
180 | 
class WorkingNode:
    """
    Mutable view on a node (delegate) used while the edit is computed

    Modifications (text, parent, children, partner) are stored on the
    working node; the structure of the delegate is only read. Parent
    and children are resolved lazily through the owning WorkingTree.
    """

    def __init__(self, src, delegate):
        self.src = src            # owning WorkingTree
        self.delegate = delegate  # wrapped node

        self.text = self.delegate.text
        self.mod_parent = None    # lazily resolved / modified parent
        self.mod_children = None  # lazily resolved / modified children

        self.mod_partner = None   # cached partner in the target tree

    @property
    def parent(self):
        """Working node of the parent (resolved from the delegate on first access)."""

        if self.mod_parent is None:
            self.mod_parent = self.src._access_wn(self.delegate.parent)

        return self.mod_parent

    @property
    def children(self):
        """List of child working nodes (created from the delegate on first access)."""
        if self.mod_children is None:
            self.mod_children = [self.src._access_wn(c) for c in self.delegate.children]

        return self.mod_children


    @property
    def partner(self):
        """Node mapped to the delegate, or None if there is no mapping."""

        if self.mod_partner is None:
            node = self.delegate

            if node is None: return None

            # Take the first mapping entry that has the delegate as source
            result = self.isomap[node, None]
            result = next(result, None)

            if result is None: return None

            self.mod_partner = result[1]

        return self.mod_partner

    @property
    def isomap(self):
        """The node mapping of the owning WorkingTree."""
        return self.src.isomap


    def apply(self, operation):
        """
        Apply an edit operation to the working tree

        Insert: creates a new InsertNode as child of this node at the
        given position, stores its id in operation.insert_id and
        returns it. Update: replaces the text of this node. Delete:
        removes operation.target_node from the children of its
        working parent. Move: deletes operation.node at its current
        place and inserts it as child of this node; returns the node.
        Other operations are ignored (returns None).
        """
        
        if isinstance(operation, Insert):
            node = InsertNode(*operation.node)
            operation.insert_id = node.node_id
            wn   = self.src._access_wn(node)
            self.children.insert(operation.position, wn)

            node.parent = self.delegate

            return node
        
        if isinstance(operation, Update):
            self.text = operation.value
            return

        if isinstance(operation, Delete):
            node = operation.target_node
            node = self.src._access_wn(node)

            # Locate the node among the children of its parent
            # NOTE(review): if the node is not found, n stays at the last
            # index and the last child is removed -- confirm this cannot occur
            for n, child in enumerate(node.parent.children):
                if child == node: break
        
            del node.parent.mod_children[n]
            return

        if isinstance(operation, Move):
            insert_node = operation.node 

            # Detach from the old parent first, then attach below self
            self.apply(Delete(insert_node))

            wn   = self.src._access_wn(insert_node)
            self.children.insert(operation.position, wn)

            wn.mod_parent = self

            return insert_node
268 | 
269 | 
class WorkingTree:
    """
    Registry of working nodes for one edit computation

    Wraps nodes into WorkingNode objects (one per node) and
    answers partner and position queries based on the mapping.
    """

    def __init__(self, isomap):
        self.isomap = isomap
        self.node_to_wn = {}  # node -> WorkingNode

    def _access_wn(self, source_node):
        """Return the working node for source_node (created on first access)."""
        if source_node is None: return None

        # Already a working node: nothing to wrap
        if isinstance(source_node, WorkingNode):
            return source_node

        if source_node not in self.node_to_wn:
            self.node_to_wn[source_node] = WorkingNode(self, source_node)

        return self.node_to_wn[source_node]

    def __getitem__(self, key):
        return self._access_wn(key)


    def partner(self, target_node): 
        """Working node mapped to target_node, or None if it is unmapped."""
        if target_node is None: return None

        # Take the first mapping entry that has target_node as target
        result = self.isomap[None, target_node]
        result = next(result, None)

        if result is None: return None

        source_node = result[0]
        wn = self._access_wn(source_node)
        wn.mod_partner = target_node
        return wn

    def position(self, target_node):
        """
        Child position at which the partner of target_node belongs

        Returns 0 if target_node has no parent or no sibling to its
        left is marked in order. Otherwise the nearest in-order left
        sibling is looked up, its partner is located among the
        children of the partner's parent, and the result is derived
        from that index.
        """
        parent = target_node.parent

        if parent is None: return 0

        # n: index of target_node among its siblings
        for n, child in enumerate(parent.children):
            if child == target_node: break

        if all(not c.inorder for c in parent.children[:n]):
            return 0

        # Walk left to the nearest sibling that is in order
        left_child = parent.children[n - 1]
        while not left_child.inorder:
            n -= 1
            left_child = parent.children[n - 1]

        left_partner = self.partner(left_child)
    
        # n: index of the partner among the children of its parent
        for n, child in enumerate(left_partner.parent.children):
            if child == left_partner: break

        # NOTE(review): n is an index on the partner's side, but the
        # in-order count is taken over the target parent's children -- confirm
        return sum(1 for c in parent.children[:n] if c.inorder) + 1
326 | 


--------------------------------------------------------------------------------
/code_diff/gumtree/editmap.py:
--------------------------------------------------------------------------------
  1 | from apted import APTED, Config
  2 | 
  3 | from .utils import subtree_dice, postorder_traversal
  4 | 
# Minimal edit mapping to make source isomorphic to target -----------------
  6 | 
  7 | # We compute a mapping between source and target tree
  8 | # If a source node is mapped to a target node with different label,
  9 | #  the source node has to be updated with the target label
 10 | # If a source node is unmapped,
 11 | #  the source node has to be deleted
 12 | # If a target node is unmapped,
 13 | #   the target node has to be added to the source tree
 14 | #
 15 | # Edits are chosen to be (approximately) minimal
 16 | 
 17 | # API method -------------------------------------------------------------
 18 | 
 19 | def gumtree_editmap(isomap, source, target, max_size = 1000, min_dice = 0.5):
 20 |     # Caution: This method does change the isomap
 21 |     if len(isomap) == 0: return isomap
 22 | 
 23 |     for source_node in postorder_traversal(source):
 24 | 
 25 |         if source_node == source: # source_node is root
 26 |             isomap.add(source_node, target)
 27 | 
 28 |             for s, t in _minimal_edit(isomap, source_node, target, max_size):
 29 |                 isomap.add(s, t)
 30 | 
 31 |             break
 32 | 
 33 |         if len(source_node.children) == 0: continue # source_node is leaf
 34 |         if (source_node, None) in isomap: continue  # source_node is now mapped
 35 | 
 36 |         target_node, dice = _select_near_candidate(source_node, isomap)
 37 | 
 38 |         if target_node is None or dice <= min_dice: continue 
 39 |         
 40 |         for s, t in _minimal_edit(isomap, source_node, target_node, max_size):
 41 |             isomap.add(s, t)
 42 |         isomap.add(source_node, target_node)
 43 | 
 44 |     return isomap
 45 | 
 46 | 
 47 | 
 48 | # APTED for computing a minimal edit --------------------------------
 49 | 
 50 | class APTEDConfig(Config):
 51 | 
 52 |     def rename(self, node1, node2):
 53 |         
 54 |         if node1.type == node2.type:
 55 |             return 1 if node1.text != node2.text else 0
 56 | 
 57 |         return 1
 58 |     
 59 |     def children(self, node):
 60 |         return node.children
 61 | 
 62 | 
 63 | def _minimal_edit(isomap, source, target, max_size = 1000):
 64 |     if source.subtree_weight > max_size or target.subtree_weight > max_size: return
 65 | 
 66 |     apted = APTED(source, target, APTEDConfig())
 67 |     mapping = apted.compute_edit_mapping()
 68 | 
 69 |     for source_node, target_node in mapping:
 70 |         if source_node is None: continue
 71 |         if target_node is None: continue
 72 |         if source_node.type != target_node.type: continue
 73 | 
 74 |         if (source_node, None) in isomap: continue
 75 |         if (None, target_node) in isomap: continue
 76 | 
 77 |         yield source_node, target_node
 78 | 
 79 | 
# Heuristically select a target node that is close to isomorphic -------
 81 | 
 82 | def _select_near_candidate(source_node, mapping):
 83 | 
 84 |     dst_seeds = []
 85 | 
 86 |     for src in source_node.descandents():
 87 |         for _, dst in mapping[src, None]:
 88 |             dst_seeds.append(dst)
 89 | 
 90 |     candidates = []
 91 |     seen = set()
 92 | 
 93 |     for dst in dst_seeds:
 94 |         while dst.parent is not None:
 95 |             parent = dst.parent
 96 |             if parent in seen: break
 97 |             seen.add(parent)
 98 | 
 99 |             if (parent.type == source_node.type
100 |                     and parent.parent is not None
101 |                     and (None, parent) not in mapping):
102 |                 candidates.append(parent)
103 |             dst = parent
104 | 
105 |     if len(candidates) == 0: return None, 0.0
106 | 
107 |     candidates = [(x, subtree_dice(source_node, x, mapping)) for x in candidates]
108 | 
109 |     return max(candidates, key=lambda x: x[1])


--------------------------------------------------------------------------------
/code_diff/gumtree/isomap.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import heapq
  3 | import itertools
  4 | import math
  5 | 
  6 | from collections import defaultdict
  7 | 
  8 | from .utils import NodeMapping, subtree_dice
  9 | 
 10 | # API method ----------------------------------------------------------------
 11 | 
 12 | def gumtree_isomap(source_ast, target_ast, min_height = 1):
 13 | 
 14 |     isomorphic_mapping = NodeMapping()
 15 |     candidate_mapping  = NodeMapping()
 16 | 
 17 |     source_index = _index_iso_nodes(source_ast)
 18 |     target_index = _index_iso_nodes(target_ast)
 19 | 
 20 |     source_open = HeightPriorityHeap(source_ast)
 21 |     target_open = HeightPriorityHeap(target_ast)
 22 | 
 23 |     while max(source_open.max(), target_open.max()) > min_height:
 24 | 
 25 |         if source_open.max() > target_open.max():
 26 |             for c in list(source_open.pop()):
 27 |                 _open_node(source_open, c)
 28 |             continue
 29 |             
 30 |         if source_open.max() < target_open.max():
 31 |             for c in list(target_open.pop()):
 32 |                 _open_node(target_open, c)
 33 |             continue
 34 | 
 35 |         source_candidates, target_candidates = list(source_open.pop()), list(target_open.pop())
 36 | 
 37 |         for source_node, target_node in itertools.product(source_candidates, target_candidates):
 38 |             # Source node and Target node have the same height
 39 |             # Check if source node is isomorph to target node
 40 | 
 41 |             if source_node.isomorph(target_node):
 42 |                 # Check if there exists more candidates
 43 |                 if (source_index[source_node] > 1
 44 |                         or target_index[target_node] > 1):
 45 |                         candidate_mapping.add(source_node, target_node)
 46 |                 else:
 47 |                     # We can savely map both nodes and all descandents
 48 |                     _map_recursively(isomorphic_mapping, source_node, target_node)
 49 | 
 50 |         # Open all unmapped nodes
 51 |         for source_node in source_candidates:
 52 |             if ((source_node, None) not in isomorphic_mapping
 53 |                 and (source_node, None) not in candidate_mapping):
 54 |                 _open_node(source_open, source_node)
 55 | 
 56 |         for target_node in target_candidates:
 57 |             if ((None, target_node) not in isomorphic_mapping
 58 |                 and (None, target_node) not in candidate_mapping):
 59 |                 _open_node(target_open, target_node)
 60 | 
 61 |     # Select the heuristically best mapping for all isomorphic pairs
 62 |     selection_heuristic = create_default_heuristic(isomorphic_mapping)
 63 |     for source_node, target_node in _select_candidates(candidate_mapping, selection_heuristic):
 64 |         _map_recursively(isomorphic_mapping, source_node, target_node)
 65 | 
 66 |     return isomorphic_mapping
 67 | 
 68 | 
 69 | # Collections ----------------------------------------------------------------
 70 | 
 71 | class NodeCounter:
 72 | 
 73 |     def __init__(self):
 74 |         self._counter = defaultdict(int)
 75 | 
 76 |     def _node_key(self, node):
 77 |         return (node.subtree_hash, node.subtree_weight)
 78 | 
 79 |     def __getitem__(self, node):
 80 |         return self._counter[self._node_key(node)]
 81 |     
 82 |     def __setitem__(self, node, value):
 83 |         self._counter[self._node_key(node)] = value
 84 | 
 85 | 
 86 | class HeightPriorityHeap:
 87 | 
 88 |     def __init__(self, start_node = None):
 89 |         self._heap = []
 90 |         self.element_count = 0
 91 | 
 92 |         if start_node is not None:
 93 |             self.push(start_node)
 94 | 
 95 |     def __len__(self):
 96 |         return len(self._heap)
 97 | 
 98 |     def push(self, x, seed = 0):
 99 |         try:
100 |             heapq.heappush(self._heap, (-x.subtree_height, x.subtree_hash, self.element_count, seed, x))
101 |             self.element_count += 1
102 |         except TypeError:
103 |             # Typically the type error occurs if we compare with the last element in tuple (Node)
104 |             # If this happens the node is already contained in the heap and we skip this push
105 |             return
106 |     
107 |     def max(self):
108 |         if len(self) == 0: return 0
109 |         return -self._heap[0][0]
110 | 
111 |     def pop(self):
112 |         current_head = self.max()
113 | 
114 |         while len(self) > 0 and self.max() == current_head:
115 |             yield heapq.heappop(self._heap)[-1]
116 | 
117 | # Helper methods -----------------------------------------------------------
118 | 
def _index_iso_nodes(ast):
    """Count, for every node yielded by iterating ast, the nodes sharing its counter key."""
    counter = NodeCounter()

    for node in ast:
        counter[node] = counter[node] + 1

    return counter
124 | 
125 | def _open_node(heap, node):
126 |     for n, child in enumerate(node.children):
127 |         heap.push(child, seed = n)
128 | 
129 | def _map_recursively(mapping, source_node, target_node):
130 |     mapping.add(source_node, target_node)
131 | 
132 |     for i, source_child in enumerate(source_node.children):
133 |         target_child = target_node.children[i]
134 |         assert source_node.type == target_node.type
135 | 
136 |         _map_recursively(mapping, source_child, target_child)
137 | 
138 | # Heuristic selection ----------------------------------------------------------------
139 | 
140 | 
def source_distance(source_node, target_node):
    """
    Negated distance between the start positions of two nodes

    position[0] is read as the (line, column) start of a node.
    Nodes starting on different lines are 1000 units apart per
    line. Nodes starting on the same line are separated by their
    column distance, capped at 999 so that it never outweighs a
    line. The distance is symmetric: only the absolute offset
    counts, not its direction.

    Returns
    -------
    int
        0 for identical start positions, more negative the
        further the nodes are apart
    """

    max_token_mover = 1000

    source_start = source_node.position[0]
    target_start = target_node.position[0]

    # Compare start with start (as the column comparison does) and ignore
    # the direction: a signed offset would rank far-away nodes best
    line_gap = abs(source_start[0] - target_start[0])

    if line_gap == 0:
        token_gap = abs(source_start[1] - target_start[1])
        return -min(token_gap, max_token_mover - 1)

    return -(line_gap * max_token_mover)
153 | 
154 | 
155 | 
def create_default_heuristic(isomorphic_mapping):
    """
    Create the default ranking function for candidate pairs

    The returned function scores a (source_node, target_node) pair
    with the tuple (subtree dice w.r.t. isomorphic_mapping,
    source distance).
    """

    def _heuristic(source_node, target_node):
        overlap = subtree_dice(source_node, target_node, isomorphic_mapping)
        closeness = source_distance(source_node, target_node)
        return (overlap, closeness)

    return _heuristic
162 | 
163 | 
164 | def _select_candidates(candidate_mapping, heuristic = None):
165 |     if len(candidate_mapping) == 0: return
166 | 
167 |     candidate_pairs = [(s, t) for s, t in candidate_mapping]
168 | 
169 |     if heuristic is not None:
170 |         candidate_pairs = sorted(candidate_pairs, 
171 |                                     key=lambda p: heuristic(*p), 
172 |                                     reverse=True)
173 |     
174 |     source_seen = set()
175 |     target_seen = set()
176 | 
177 |     while len(candidate_pairs) > 0:
178 |         source_node, target_node = candidate_pairs.pop(0)
179 | 
180 |         if source_node in source_seen:
181 |             continue
182 |         source_seen.add(source_node)
183 | 
184 |         if target_node in target_seen:
185 |             continue
186 |         target_seen.add(target_node)
187 |         
188 |         yield source_node, target_node
189 | 
190 | 


--------------------------------------------------------------------------------
/code_diff/gumtree/ops.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from dataclasses import dataclass
  3 | from typing import Any, Tuple
  4 | 
@dataclass
class EditOperation:
    """Base class of all edit operations."""
    # Node the operation refers to (meaning depends on the operation)
    target_node: Any

@dataclass
class Update(EditOperation):
    """Replace the text of target_node by value."""
    # New text of the node
    value: Any

@dataclass
class Insert(EditOperation):
    """Insert a new node as child of target_node."""
    # (type, text) of the new node
    node: Tuple[str, Any]
    # Index among the children of target_node
    position: int
    insert_id: int # This is necessary to keep track of nodes (TODO: Better solution?)

@dataclass
class Move(EditOperation):
    """Move node to become a child of target_node."""
    # Node that is moved
    node: Any
    # Index among the children of target_node
    position: int

@dataclass
class Delete(EditOperation):
    """Remove target_node from the tree."""
    pass
 27 | 
 28 | # Edit script ----------------------------------------------------------------
 29 | 
 30 | class EditScript(list):
 31 | 
 32 |     def __init__(self, operations):
 33 |         super().__init__(operations)
 34 | 
 35 |     def __repr__(self):
 36 |         return serialize_script(self, indent = 2)
 37 | 
 38 | 
 39 | # Serialization --------------------------------
 40 | 
 41 | 
 42 | def _serialize_new_node(new_node_index, node):
 43 |     
 44 |     if node.node_id not in new_node_index:
 45 |         new_node_index[node.node_id] = len(new_node_index)
 46 | 
 47 |     return "N%d" % new_node_index[node.node_id]
 48 | 
 49 | def _serialize_ast_node(node):
 50 |    position  = node.position
 51 |    node_text = node.type
 52 | 
 53 |    if node.text: node_text += ":" + node.text
 54 |    
 55 |    return "(%s, line %d:%d - %d:%d)" % (node_text, position[0][0], position[0][1], position[1][0], position[1][1]) 
 56 | 
 57 | 
 58 | def _serialize_node(new_node_index, node):
 59 |     
 60 |     if hasattr(node, 'node_id'):
 61 |         return _serialize_new_node(new_node_index, node)
 62 |     
 63 |     return _serialize_ast_node(node)
 64 | 
 65 | 
 66 | def serialize_script(edit_script, indent = 0):
 67 |     
 68 |     sedit_script = []
 69 |     new_node_index = {}
 70 | 
 71 |     for operation in edit_script:
 72 | 
 73 |         operation_name = operation.__class__.__name__
 74 |         target_node_str = _serialize_node(new_node_index, operation.target_node)
 75 | 
 76 |         if operation_name == "Update":
 77 |             sedit_script.append("%s(%s, %s)" % (operation_name, target_node_str, operation.value))
 78 |         
 79 |         elif operation_name == "Insert":
 80 |             
 81 |             new_node = operation.node
 82 | 
 83 |             if new_node[1] is None:
 84 |                 new_node_index[operation.insert_id] = len(new_node_index)
 85 |                 new_node_str = "(%s, %s)" % (new_node[0], "N%d" % new_node_index[operation.insert_id])
 86 |             else: # Leaf node
 87 |                 new_node_str = "%s:%s" % new_node
 88 | 
 89 |             sedit_script.append("%s(%s, %s, %d)" % (operation_name, new_node_str, target_node_str, operation.position))
 90 | 
 91 |         elif operation_name == "Move":
 92 | 
 93 |             new_node_str = _serialize_node(new_node_index, operation.node)
 94 | 
 95 |             sedit_script.append("%s(%s, %s, %d)" % (operation_name, new_node_str, target_node_str, operation.position))
 96 | 
 97 |         elif operation_name == "Delete":
 98 |             sedit_script.append("%s(%s)" % (operation_name, target_node_str))
 99 | 
100 |     if indent > 0:
101 |         sedit_script = [" "*indent + e for e in sedit_script]
102 |         return "[\n%s\n]" % (",\n").join(sedit_script)
103 | 
104 |     return "[%s]" % ", ".join(sedit_script)
105 | 
106 | 
107 | 
108 | # Deserialize --------------------------------------------------------------------------------------------------------------------------------
109 | 
class DASTNode:
    """Stand-in for an existing AST node rebuilt from a serialized script.

    Only the node ``type``, its ``position`` (kept exactly as passed in)
    and the optional ``text`` are stored.
    """

    def __init__(self, type, position, text = None):
        self.type, self.position, self.text = type, position, text

    def __repr__(self):
        return "Node({}, {}, {})".format(self.type, str(self.text), self.position)
119 | 
120 | 
class InsertNode:
    """Stand-in for a node created by an ``Insert`` operation.

    ``node_id`` is the label under which the node is referenced, ``type``
    its node type and ``text`` the optional token text.
    """

    def __init__(self, node_id, type, text = None):
        self.node_id, self.type, self.text = node_id, type, text

    def __repr__(self):
        return "{}({}, {})".format(self.node_id, self.type, str(self.text))
130 | 
131 | 
132 | def _split_args(inst):
133 |     args = []
134 | 
135 |     bracket_open = 0
136 |     str_open     = False
137 |     for i, c in enumerate(inst):
138 | 
139 |         # Lookahead
140 |         if i > 0 and i < len(inst) - 1 and c in ["(", ")", ",", "\"", "\'"]:
141 |             if inst[i - 1] == ":" and inst[i - 2] == c: continue
142 |             if inst[i + 1] == ":" and inst[i + 2] == c: continue
143 | 
144 |         if c in ["\"", "\'"]:
145 |             str_open = not str_open
146 | 
147 |         if str_open: continue
148 | 
149 |         if c == "(": 
150 |             if bracket_open == 0: args.append(i)
151 |             bracket_open += 1
152 |             continue
153 | 
154 |         if c == ")": 
155 |             bracket_open -= 1
156 |             if bracket_open == 0: args.append(i)
157 |             continue
158 |             
159 |         if bracket_open == 1 and c == ",":
160 |             args.append(i)
161 | 
162 |     return [inst[args[i - 1] + 1: args[i]].strip() for i in range(1, len(args))]
163 | 
164 | 
def _deserialize_insert_node(node_registry, node_info):
    """Rebuild the node argument of a serialized ``Insert`` instruction.

    Leaves are written as ``<type>:<text>`` and become an ``InsertNode`` with
    the id ``"T"``.  Constructed nodes are written as ``(<type>, N<k>)``; they
    are created once per label and cached in ``node_registry`` so later
    references to the label resolve to the same object.
    """
    # A constructed node always starts with "(".  Testing for a parenthesis
    # anywhere in the string would misread leaves whose text merely contains
    # one (e.g. a string or comment token) and fail when unpacking below.
    # "(:(" is the one leaf that starts with "("; "):)" is listed as before.
    is_leaf = not node_info.startswith("(") or node_info in ("(:(", "):)")

    if is_leaf:
        return InsertNode("T", *_parse_type(node_info))

    node_type, node_id = _split_args(node_info)

    if node_id in node_registry: return node_registry[node_id]

    insert_node = InsertNode(node_id, node_type)
    node_registry[node_id] = insert_node

    return insert_node
178 | 
179 | 
180 | def _parse_type(node_type):
181 |     if ":" in node_type:
182 |         return node_type.split(":", 1)
183 |     return node_type, None
184 | 
185 | 
def _deserialize_node(node_registry, node_info):
    """Rebuild a node reference from its serialized form.

    Text containing ``(`` is read as ``(<type>[:<text>], <position>)`` and
    becomes a ``DASTNode`` whose position is the raw position string.  Any
    other text is a label: it resolves to the registered node, or to a new
    ``InsertNode`` of type ``"unknown"`` when the label is not registered.
    """
    if "(" not in node_info:
        if node_info not in node_registry:
            return InsertNode(node_info, "unknown")
        return node_registry[node_info]

    type_and_text, position = _split_args(node_info)
    node_type, node_text    = _parse_type(type_and_text)
    return DASTNode(node_type, position, text = node_text)
197 | 
198 | 
def _deserialize_update(node_registry, inst):
    """Rebuild an ``Update`` operation from ``Update(<target>, <value>)``."""
    target_info, new_value = _split_args(inst)
    return Update(_deserialize_node(node_registry, target_info), new_value)
203 | 
204 | 
def _deserialize_insert(node_registry, inst):
    """Rebuild an ``Insert`` operation from ``Insert(<node>, <target>, <pos>)``.

    The inserted node is resolved before the target.  The operation is
    created with the position converted to ``int`` and ``-1`` as its last
    argument.
    """
    node_info, target_info, position = _split_args(inst)

    inserted = _deserialize_insert_node(node_registry, node_info)
    target   = _deserialize_node(node_registry, target_info)

    return Insert(target, inserted, int(position), -1)
212 | 
213 | 
def _deserialize_delete(node_registry, inst):
    """Rebuild a ``Delete`` operation from ``Delete(<target>)``."""
    target_info = _split_args(inst)[0]
    return Delete(_deserialize_node(node_registry, target_info))
218 | 
219 | 
def _deserialize_move(node_registry, inst):
    """Rebuild a ``Move`` operation from ``Move(<node>, <target>, <pos>)``.

    The first serialized argument is the moved node, the second the target
    it is moved under; ``Move`` receives them in the opposite order.
    """
    moved_info, target_info, position = _split_args(inst)
    moved  = _deserialize_node(node_registry, moved_info)
    target = _deserialize_node(node_registry, target_info)
    return Move(target, moved, int(position))
225 |         
226 | 
def deserialize_script(script_string):
    """Parse the multi-line output of ``serialize_script`` into operations.

    The first and the last line of ``script_string`` (the enclosing ``[`` and
    ``]``) are dropped; every remaining non-blank line is read as one
    instruction.  Labels of inserted nodes are shared through one registry
    for the whole script.

    Returns:
        list of the rebuilt operations, in script order.

    Raises:
        ValueError: if a non-blank line starts with none of ``Update``,
            ``Insert``, ``Delete`` or ``Move``.
    """
    instructions = script_string.split("\n")[1:-1]

    script = []
    node_registry = {}
    for instruction in instructions:
        instruction = instruction.strip()

        # An empty script serializes to "[\n\n]", which leaves one blank line
        if not instruction:
            continue

        if instruction.startswith("Update"):
            op = _deserialize_update(node_registry, instruction)
        elif instruction.startswith("Insert"):
            op = _deserialize_insert(node_registry, instruction)
        elif instruction.startswith("Delete"):
            op = _deserialize_delete(node_registry, instruction)
        elif instruction.startswith("Move"):
            op = _deserialize_move(node_registry, instruction)
        else:
            # Previously the operation of the preceding line was appended again
            raise ValueError("Unknown edit operation: %r" % instruction)

        script.append(op)

    return script
248 | 
249 | 
250 | # Fast serialize -----------------------------------------------------------------------------------------------------------------------------
251 | 
252 | def _json_serialize_new_node(new_node_index, node):
253 |     
254 |     if node.node_id not in new_node_index:
255 |         new_node_index[node.node_id] = len(new_node_index)
256 | 
257 |     return "N%d" % new_node_index[node.node_id]
258 | 
259 | 
260 | def _json_serialize_ast_node(node):
261 |    position  = node.position
262 |    node_text = node.type
263 | 
264 |    if node.text: node_text += ":" + node.text
265 | 
266 |    return [node_text, position[0][0], position[0][1], position[1][0], position[1][1]]
267 | 
268 | 
def _json_serialize_node(new_node_index, node):
    """Encode ``node`` as an ``N<k>`` label or as a positioned AST node.

    Nodes exposing a ``node_id`` attribute get a label; all others are
    encoded as a list with their label and position.
    """
    if not hasattr(node, 'node_id'):
        return _json_serialize_ast_node(node)

    return _json_serialize_new_node(new_node_index, node)
275 | 
276 | 
def json_serialize(edit_script):
    """Serialize an edit script to a JSON string.

    The result is a JSON list with one entry per operation, chosen by class
    name: ``["Update", target, value]``, ``["Insert", target, new_node,
    position]``, ``["Move", target, node, position]`` or
    ``["Delete", target]``.  Operations with any other class name are
    left out.
    """
    encoded = []
    labels = {}  # node_id / insert_id -> running index behind the N<k> labels

    for op in edit_script:
        kind = op.__class__.__name__
        target = _json_serialize_node(labels, op.target_node)

        if kind == "Delete":
            encoded.append([kind, target])

        elif kind == "Update":
            encoded.append([kind, target, op.value])

        elif kind == "Move":
            moved = _json_serialize_node(labels, op.node)
            encoded.append([kind, target, moved, op.position])

        elif kind == "Insert":
            inserted = op.node

            if inserted[1] is not None:
                # Leaf node: type and text inline, marked with "T"
                inserted_entry = ["%s:%s" % inserted, "T"]
            else:
                # No text: register a fresh label under the operation's insert id
                labels[op.insert_id] = len(labels)
                inserted_entry = [inserted[0], "N%d" % labels[op.insert_id]]

            encoded.append([kind, target, inserted_entry, op.position])

    return json.dumps(encoded)
310 | 
311 | 
312 | # Fast deserialize ----------------------------------------------------------------------
313 | 
314 | def _json_deserialize_node(node_index, node_info):
315 | 
316 |     if not isinstance(node_info, list) and node_info != "T":
317 |         node_id = int(node_info[1:])
318 |         return node_index[node_id]
319 | 
320 |     node_type, position = node_info[0], node_info[1:]
321 |     node_text = None
322 | 
323 |     if ":" in node_type:
324 |         node_type, node_text = node_type.split(":", 1)
325 | 
326 |     if len(position) == 4:
327 |         return DASTNode(node_type, ((position[0], position[1]), (position[2], position[3])), node_text)
328 | 
329 |     return InsertNode(position[0], node_type, node_text)
330 | 
331 | 
def _json_deserialize_node_constructor(node_index, cn_info):
    """Rebuild the ``[label, id]`` node entry of a JSON ``Insert`` operation.

    With the id ``"T"`` a fresh ``InsertNode`` is returned and nothing is
    registered.  Any other id has the form ``N<k>``: the node is created
    with the integer ``k`` as its id and stored in ``node_index[k]``,
    replacing any node registered there before.
    """
    label, node_id = cn_info
    node_type, node_text = _parse_type(label)

    if node_id == "T":
        return InsertNode(node_id, node_type, node_text)

    node_id = int(node_id[1:])
    created = InsertNode(node_id, node_type, node_text)
    node_index[node_id] = created
    return created
345 | 
346 | 
def _json_deserialize_update(node_index, operation):
    """Rebuild an ``Update`` from ``["Update", target, value]``."""
    target_info, new_value = operation[1:]
    return Update(_json_deserialize_node(node_index, target_info), new_value)
351 | 
352 | 
def _json_deserialize_insert(node_index, operation):
    """Rebuild an ``Insert`` from ``["Insert", target, new_node, position]``.

    The target is resolved before the new node is constructed.  ``Insert``
    receives the new node as a ``(type, text)`` pair together with its id.
    """
    target_info, node_info, position = operation[1:]

    target  = _json_deserialize_node(node_index, target_info)
    created = _json_deserialize_node_constructor(node_index, node_info)

    return Insert(target, (created.type, created.text), position, created.node_id)
359 | 
360 | 
def _json_deserialize_delete(node_index, operation):
    """Rebuild a ``Delete`` from ``["Delete", target]``."""
    target = _json_deserialize_node(node_index, operation[1])
    return Delete(target)
363 | 
364 | 
def _json_deserialize_move(node_index, operation):
    """Rebuild a ``Move`` from ``["Move", target, node, position]``."""
    target_info, moved_info, position = operation[1:]
    target = _json_deserialize_node(node_index, target_info)
    moved  = _json_deserialize_node(node_index, moved_info)
    return Move(target, moved, position)
370 |     
371 | 
# Dispatch table: operation name (first element of a JSON entry) -> decoder
DESERIALIZE = {
    "Update": _json_deserialize_update,
    "Insert": _json_deserialize_insert,
    "Delete": _json_deserialize_delete,
    "Move": _json_deserialize_move,
}
378 | 
379 | 
def json_deserialize(edit_json):
    """Rebuild an ``EditScript`` from the output of ``json_serialize``.

    Entries are decoded in order through ``DESERIALIZE``; they share one
    index of inserted nodes so labels resolve to nodes created earlier.
    An entry with an unknown operation name raises ``KeyError``.
    """
    node_index = {}
    operations = [
        DESERIALIZE[entry[0]](node_index, entry)
        for entry in json.loads(edit_json)
    ]

    return EditScript(operations)


--------------------------------------------------------------------------------
/code_diff/gumtree/utils.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | 
  3 | # Collections -------------------------------------------------------------------
  4 | 
  5 | class NodeMapping:
  6 | 
  7 |     def __init__(self):
  8 |         self._src_to_dst = defaultdict(set)
  9 |         self._dst_to_src = defaultdict(set)
 10 |         self._length = 0
 11 | 
 12 |     def __getitem__(self, key):
 13 |         if not isinstance(key, tuple): key = (key, None)
 14 | 
 15 |         src_key, dst_key = key
 16 | 
 17 |         if src_key is not None and dst_key is not None:
 18 |             return dst_key in self._src_to_dst[src_key]
 19 | 
 20 |         if src_key is None and dst_key is None:
 21 |             return self.__iter__()
 22 | 
 23 |         if src_key is None:
 24 |             return ((src, dst_key) for src in self._dst_to_src[dst_key])
 25 |         
 26 |         if dst_key is None:
 27 |             return ((src_key, dst) for dst in self._src_to_dst[src_key])
 28 |     
 29 |     def __iter__(self):
 30 | 
 31 |         def _iter_maps():
 32 |             for k, V in self._src_to_dst.items():
 33 |                 for v in V: yield (k, v)
 34 | 
 35 |         return _iter_maps()
 36 | 
 37 |     def __contains__(self, key):
 38 |         if not isinstance(key, tuple): key = (key, None)
 39 | 
 40 |         src_key, dst_key = key
 41 | 
 42 |         if src_key is not None and dst_key is not None:
 43 |             return self[src_key, dst_key]
 44 | 
 45 |         return next(self[src_key, dst_key], None) is not None
 46 | 
 47 |     def __len__(self):
 48 |         return self._length
 49 | 
 50 |     def add(self, src, dst):
 51 |         if not self[src, dst]:
 52 |             self._src_to_dst[src].add(dst)
 53 |             self._dst_to_src[dst].add(src)
 54 |             self._length += 1
 55 | 
 56 |     def __copy__(self):
 57 |         output = NodeMapping()
 58 | 
 59 |         for a, b in self:
 60 |             output.add(a, b)
 61 |         
 62 |         return output
 63 | 
 64 |     def __str__(self):
 65 |         approx_str = []
 66 | 
 67 |         for src, dst in self:
 68 |             approx_str.append("%s ≈ %s" % (str(src), str(dst)))
 69 |         
 70 |         return "\n".join(approx_str)
 71 | 
 72 | 
 73 | # Tree heuristic ----------------------------------------------------------------
 74 | 
 75 | def subtree_dice(A, B, mapping):
 76 | 
 77 |     if A is None or B is None:
 78 |         return 1.0 if all(x is None for x in [A, B]) else 0.0
 79 | 
 80 |     DA, DB = set(A.descandents()), set(B.descandents())
 81 | 
 82 |     norm = len(DA) + len(DB)
 83 | 
 84 |     if norm == 0: return 1.0
 85 | 
 86 |     mapped = defaultdict(set)
 87 |     for a, b in mapping: mapped[a].add(b)
 88 | 
 89 |     mapped_children = set(m for t1 in DA if t1 in mapped for m in mapped[t1])
 90 |     dice_score = len(set.intersection(mapped_children, DB))
 91 | 
 92 |     return 2 * dice_score / norm
 93 | 
 94 | 
 95 | # Tree traversal ----------------------------------------------------------------
 96 | 
 97 | def bfs_traversal(tree):
 98 |     queue = [tree]
 99 | 
100 |     while len(queue) > 0: 
101 |         node = queue.pop(0)
102 | 
103 |         yield node
104 | 
105 |         for c in node.children: 
106 |             queue.append(c)   
107 | 
108 | 
def dfs_traversal(tree):
    """Yield the nodes of ``tree`` depth-first, parents before children.

    Children are pushed left to right onto a stack, so the last child of a
    node is the first one to be visited.
    """
    pending = [tree]

    while pending:
        node = pending.pop()

        yield node

        pending.extend(node.children)
119 | 
120 | 
def postorder_traversal(tree):
    """Yield the nodes of ``tree`` in post-order (children before parent).

    Implemented iteratively: each stack entry pairs a node with the index
    of the next child that still has to be visited.
    """
    pending = [(tree, 0)]

    while pending:
        node, next_child = pending.pop()
        children = node.children

        if next_child < len(children):
            # Come back to this node after the child's subtree is done
            pending.append((node, next_child + 1))
            pending.append((children[next_child], 0))
        else:
            yield node
133 | 
134 | 


--------------------------------------------------------------------------------
/code_diff/sstubs.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | 
  3 | class SStubPattern(Enum):
  4 | 
  5 |     MULTI_STMT                     = 0
  6 |     SINGLE_STMT                    = 1
  7 |     SINGLE_TOKEN                   = 2
  8 |     NO_STMT                        = 3
  9 | 
 10 |     # Functions
 11 |     WRONG_FUNCTION_NAME            = 4
 12 | 
 13 |     SAME_FUNCTION_MORE_ARGS        = 5
 14 |     SAME_FUNCTION_LESS_ARGS        = 6
 15 |     SAME_FUNCTION_WRONG_CALLER     = 7
 16 |     SAME_FUNCTION_SWAP_ARGS        = 8
 17 | 
 18 |     ADD_FUNCTION_AROUND_EXPRESSION = 9
 19 |     ADD_METHOD_CALL                = 10
 20 | 
 21 |     # Changes (single token)
 22 |     CHANGE_IDENTIFIER_USED         = 11
 23 |     CHANGE_NUMERIC_LITERAL         = 12
 24 |     CHANGE_BOOLEAN_LITERAL         = 13
 25 | 
 26 |     # Change operator / operand
 27 |     CHANGE_UNARY_OPERATOR          = 14
 28 |     CHANGE_BINARY_OPERATOR         = 15 
 29 |     CHANGE_BINARY_OPERAND          = 16
 30 | 
 31 |     # Changes (Access)
 32 |     CHANGE_ATTRIBUTE_USED          = 17
 33 |     CHANGE_KEYWORD_ARGUMENT_USED   = 18
 34 |     CHANGE_CONSTANT_TYPE           = 19
 35 | 
 36 |     ADD_ELEMENTS_TO_ITERABLE       = 20
 37 |     ADD_ATTRIBUTE_ACCESS           = 21 
 38 | 
 39 |     # If condition
 40 |     MORE_SPECIFIC_IF               = 22
 41 |     LESS_SPECIFIC_IF               = 23
 42 | 
 43 |     # STRING
 44 |     CHANGE_STRING_LITERAL          = 24 # This is not a sstub pattern but helpful for scanning results
 45 | 
 46 | 
 47 | # SStub classification -------------------------------
 48 | 
 49 | def classify_sstub(source_ast, target_ast):
 50 |     # Assume tree is minimized to smallest edit
 51 | 
 52 |     classifier_fns = []
 53 | 
 54 |     if len(source_ast.children) == 0 and len(target_ast.children) == 0:
 55 |         classifier_fns.append(single_token_edit)
 56 | 
 57 |     if source_ast.parent.type == "call" and target_ast.parent.type == "call":
 58 |         source_name = _call_name(source_ast.parent)
 59 |         target_name = _call_name(target_ast.parent)
 60 | 
 61 |         if source_name == target_name:
 62 |             classifier_fns.append(same_function_mod)
 63 | 
 64 |     if (_query_path(source_ast, "if_statement", "condition")
 65 |          or _query_path(source_ast, "elif_clause", "condition")
 66 |          or _query_path(source_ast, "while_statement", "condition")):
 67 |         classifier_fns.append(change_if_statement)
 68 | 
 69 |     if source_ast.type in ["tuple", "list", "dictionary", "set"]:
 70 |         classifier_fns.append(change_iterable)
 71 | 
 72 |     if target_ast.type == "call" or target_ast.parent.type == "call":
 73 |         classifier_fns.append(add_function)
 74 | 
 75 |     if target_ast.type == "attribute":
 76 |         classifier_fns.append(add_attribute_access)
 77 | 
 78 |     if "operator" in source_ast.type or "operator" in target_ast.type:
 79 |         if is_unary_operator_change(source_ast, target_ast):
 80 |             return SStubPattern.CHANGE_UNARY_OPERATOR
 81 | 
 82 |     # Now run all classifier functions
 83 |     for classifier_fn in classifier_fns:
 84 |         result = classifier_fn(source_ast, target_ast)
 85 | 
 86 |         if result != SStubPattern.SINGLE_STMT:
 87 |             return result
 88 | 
 89 |     if is_binary_operand(source_ast, target_ast):
 90 |         return SStubPattern.CHANGE_BINARY_OPERAND
 91 | 
 92 |     return SStubPattern.SINGLE_STMT
 93 | 
 94 | 
 95 | # Utils -------------------------------------------------------------------------
 96 | 
 97 | def _call_name(ast_node):
 98 |     function_node = ast_node.children[0]
 99 | 
100 |     right_most = function_node
101 |     while len(right_most.children) > 0:
102 |         right_most = right_most.children[-1]
103 |     
104 |     return right_most.text
105 | 
106 | 
107 | 
def pisomorph(A, B):
    """Isomorphism test that looks through parenthesized expressions.

    ``A`` and ``B`` match if ``A.isomorph(B)`` holds, or if it holds after
    replacing a ``parenthesized_expression`` on either side by its second
    child.  Wrappers around ``A`` are removed before wrappers around ``B``.
    """
    wrapper = "parenthesized_expression"

    while True:
        if A.isomorph(B):
            return True

        if A.type == wrapper:
            A = A.children[1]
        elif B.type == wrapper:
            B = B.children[1]
        else:
            return False
118 | 
119 |     
120 | 
121 | # Binary operand ----------------------------------------------------------------
122 | 
123 | 
def is_binary_operand(source_ast, target_ast):
    """Tell whether ``source_ast`` is a direct operand of a binary expression.

    True when the parent of ``source_ast`` is a binary, comparison or boolean
    operator and ``source_ast`` sits in its ``left`` or ``right`` field.
    ``target_ast`` is not inspected.
    """
    operator_types = ("binary_operator", "comparison_operator", "boolean_operator")

    return any(
        _query_path(source_ast, operator_type, side, depth = 1)
        for operator_type in operator_types
        for side in ("left", "right")
    )
132 | 
133 | 
134 | 
135 | # Single token edits --------------------------------
136 | 
137 | def _query_path(ast_node, type_query, edge_query = "*", depth = 1e9):
138 | 
139 |     last    = None
140 |     current = ast_node
141 |     while current is not None:
142 | 
143 |         if current.type == type_query:
144 |             
145 |             if edge_query == "*":
146 |                 return True
147 |             elif last is not None:
148 |                 if hasattr(current, "backend"):
149 |                     edge_child = current.backend.child_by_field_name(edge_query)
150 |                     return edge_child == last.backend
151 | 
152 |         last    = current
153 |         current = current.parent
154 |         depth  -= 1
155 |         if depth < 0: break
156 |     
157 |     return False
158 | 
159 | 
160 | 
161 | def _get_parent(ast_node, type_query, edge_query = "*", depth = 1e9):
162 | 
163 |     last    = None
164 |     current = ast_node
165 |     while current is not None:
166 | 
167 |         if current.type == type_query:
168 |             
169 |             if edge_query == "*":
170 |                 return current
171 |             elif last is not None:
172 |                 if hasattr(current, "backend"):
173 |                     edge_child = current.backend.child_by_field_name(edge_query)
174 |                     if edge_child == last.backend:
175 |                         return current
176 | 
177 |         last    = current
178 |         current = current.parent
179 |         depth  -= 1
180 |         if depth < 0: break
181 |     
182 |     return None
183 | 
184 | 
185 | 
def wrong_function_name(source_ast, target_ast):
    """Tell whether an identifier change renames the function being called.

    Both nodes must be identifiers and the source must lie in the
    ``function`` field of a call.  The source then has to be found on the
    right-most spine (always following the last child) of that field.
    """
    if source_ast.type != "identifier" or target_ast.type != "identifier":
        return False

    call_node = _get_parent(source_ast, "call", "function")
    if call_node is None:
        return False

    # Walk down the last children of the callee until the source is met
    cursor = call_node.backend.child_by_field_name("function")
    while cursor is not None and cursor != source_ast.backend:
        cursor = cursor.children[-1] if len(cursor.children) > 0 else None

    return cursor is not None
201 | 
202 | 
def change_numeric_literal(source_ast, target_ast):
    """True when both nodes are numeric literals (``integer`` or ``float``)."""
    numeric_types = ("integer", "float")
    return source_ast.type in numeric_types and target_ast.type in numeric_types
205 | 
206 | 
def change_string_literal(source_ast, target_ast):
    """True when both nodes are of type ``string``."""
    return (source_ast.type, target_ast.type) == ("string", "string")
209 | 
210 | 
def change_boolean_literal(source_ast, target_ast):
    """True when both nodes are boolean literals (``true`` or ``false``)."""
    boolean_types = ("false", "true")
    return source_ast.type in boolean_types and target_ast.type in boolean_types
213 | 
214 | 
def change_attribute_used(source_ast, target_ast):
    """Tell whether the source is the ``attribute`` field of an attribute access.

    Only identifiers qualify, and the enclosing ``attribute`` node must be
    the direct parent.  ``target_ast`` is not inspected.
    """
    if source_ast.type != "identifier":
        return False

    return _query_path(source_ast, "attribute", "attribute", depth = 1)
219 | 
220 | 
def change_identifier_used(source_ast, target_ast):
    """True when one identifier was replaced by another outside a declaration.

    Following ManySStuBs, changes whose parent type contains ``definition``
    or ``declaration`` are ignored.
    """
    parent_type = source_ast.parent.type
    if "definition" in parent_type or "declaration" in parent_type:
        return False

    return source_ast.type == "identifier" and target_ast.type == "identifier"
228 | 
229 | 
def change_binary_operator(source_ast, target_ast):
    """True when the source is the operator token of a binary expression.

    The parent must be a binary, boolean or comparison operator and the
    source must equal its second child.  ``target_ast`` is not inspected.
    """
    parent = source_ast.parent

    if parent.type not in ("binary_operator", "boolean_operator", "comparison_operator"):
        return False

    return parent.children[1] == source_ast
237 | 
238 | 
239 | def _to_plain_constant(text):
240 |     
241 |     if "\'" in text: text = text[1:-1]
242 |     if "\"" in text: text = text[1:-1]
243 | 
244 |     try:
245 |         return float(text)
246 |     except:
247 |         try:
248 |             return float(int(text))
249 |         except:
250 |             return text
251 | 
252 | 
def change_constant_type(source_ast, target_ast):
    """True when the same constant is written with a different node type.

    Identifiers never qualify, and the two node types must differ.  The
    texts are compared after normalization by ``_to_plain_constant``.
    """
    if source_ast.type == "identifier" or target_ast.type == "identifier":
        return False

    if source_ast.type == target_ast.type:
        return False

    return _to_plain_constant(source_ast.text) == _to_plain_constant(target_ast.text)
264 | 
265 | 
def change_keyword_argument_used(source_ast, target_ast):
    """Tell whether the source is the ``name`` field of a keyword argument.

    Only identifiers qualify, and the ``keyword_argument`` node must be the
    direct parent.  ``target_ast`` is not inspected.
    """
    if source_ast.type != "identifier":
        return False

    return _query_path(source_ast, "keyword_argument", "name", depth = 1)
270 | 
271 | 
def same_function_wrong_caller(source_ast, target_ast):
    """Tell whether the source is the object a method is called on.

    The source must be an identifier within two levels below the
    ``function`` field of a call, and it must be the ``object`` field of its
    parent attribute node.  ``target_ast`` is not inspected.
    """
    if source_ast.type != "identifier":
        return False

    in_callee = _query_path(source_ast, "call", "function", depth = 2)
    if not in_callee:
        return False

    return _query_path(source_ast, "attribute", "object", depth = 1)
278 | 
279 | 
280 | 
# Single-token patterns with their tests.  The tests are tried in this
# order and the first match decides the pattern.
single_token_edits = {
    SStubPattern.WRONG_FUNCTION_NAME          : wrong_function_name,
    SStubPattern.CHANGE_CONSTANT_TYPE         : change_constant_type,
    SStubPattern.CHANGE_NUMERIC_LITERAL       : change_numeric_literal,
    SStubPattern.CHANGE_BOOLEAN_LITERAL       : change_boolean_literal,
    SStubPattern.CHANGE_ATTRIBUTE_USED        : change_attribute_used,
    SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED : change_keyword_argument_used,
    SStubPattern.SAME_FUNCTION_WRONG_CALLER   : same_function_wrong_caller,
    SStubPattern.CHANGE_BINARY_OPERATOR       : change_binary_operator,
    SStubPattern.CHANGE_BINARY_OPERAND        : is_binary_operand,
    SStubPattern.CHANGE_IDENTIFIER_USED       : change_identifier_used,
    SStubPattern.CHANGE_STRING_LITERAL        : change_string_literal,
}
294 | 
295 | 
def single_token_edit(source_ast, target_ast):
    """Classify a change between two leaf nodes.

    Returns the pattern of the first test in ``single_token_edits`` that
    accepts the pair, or ``SINGLE_TOKEN`` when none does.
    """
    matching = (
        pattern
        for pattern, accepts in single_token_edits.items()
        if accepts(source_ast, target_ast)
    )

    return next(matching, SStubPattern.SINGLE_TOKEN)
303 | 
304 | 
305 | # Same function --------------------------------
306 | 
307 | 
def same_function_more_args(source_ast, target_ast):
    """True when the target has more children and keeps all source children.

    Every child of the source must match some child of the target under
    ``pisomorph``.
    """
    old_children, new_children = source_ast.children, target_ast.children

    if len(old_children) >= len(new_children):
        return False

    return all(
        any(pisomorph(new, old) for new in new_children)
        for old in old_children
    )
319 | 
320 | 
def same_function_less_args(source_ast, target_ast):
    """True when the target has fewer children, all taken from the source.

    Every child of the target must match some child of the source under
    ``pisomorph``.
    """
    old_children, new_children = source_ast.children, target_ast.children

    if len(old_children) <= len(new_children):
        return False

    return all(
        any(pisomorph(old, new) for old in old_children)
        for new in new_children
    )
332 | 
333 | 
def same_function_swap_args(source_ast, target_ast):
    """True when exactly two children traded places.

    Both nodes need the same number of children.  Exactly two positions may
    differ under ``pisomorph``, and the source child at each of them must
    match the target child at the other.
    """
    old_children = source_ast.children
    new_children = target_ast.children

    if len(old_children) != len(new_children):
        return False

    differing = [
        position
        for position, (old, new) in enumerate(zip(old_children, new_children))
        if not pisomorph(old, new)
    ]

    if len(differing) != 2:
        return False

    first, second = differing
    return (pisomorph(old_children[first], new_children[second])
             and pisomorph(old_children[second], new_children[first]))
349 | 
350 | 
# Argument-list patterns with their tests, tried in this order
same_function_edits = {
    SStubPattern.SAME_FUNCTION_MORE_ARGS : same_function_more_args,
    SStubPattern.SAME_FUNCTION_LESS_ARGS : same_function_less_args,
    SStubPattern.SAME_FUNCTION_SWAP_ARGS : same_function_swap_args,
}
356 | 
357 | 
def same_function_mod(source_ast, target_ast):
    """Classify a change between the argument lists of the same function.

    Returns the pattern of the first test in ``same_function_edits`` that
    accepts the pair.  ``SINGLE_STMT`` is returned when no test accepts it
    or when either node is not an ``argument_list``.
    """
    if source_ast.type == "argument_list" and target_ast.type == "argument_list":
        for pattern, accepts in same_function_edits.items():
            if accepts(source_ast, target_ast):
                return pattern

    return SStubPattern.SINGLE_STMT
368 | 
369 | 
370 | 
371 | # If statement ----------------------------------------------------------------
372 | 
373 | 
def more_specific_if(source_ast, target_ast):
    """True when the target is an ``and`` expression containing the source.

    The target must be a ``boolean_operator`` whose second child is ``and``,
    and one of its children must match the source under ``pisomorph``.
    """
    if target_ast.type != "boolean_operator" or target_ast.children[1].type != "and":
        return False

    return any(pisomorph(child, source_ast) for child in target_ast.children)
380 | 
381 | 
def less_specific_if(source_ast, target_ast):
    """True when the target is an ``or`` expression containing the source.

    The target must be a ``boolean_operator`` whose second child is ``or``,
    and one of its children must match the source under ``pisomorph``.
    """
    if target_ast.type != "boolean_operator" or target_ast.children[1].type != "or":
        return False

    return any(pisomorph(child, source_ast) for child in target_ast.children)
387 | 
388 | 
def change_if_statement(source_ast, target_ast):
    """Classify a change to a condition.

    Returns ``MORE_SPECIFIC_IF`` or ``LESS_SPECIFIC_IF`` when the matching
    test accepts the pair (tried in that order), else ``SINGLE_STMT``.
    """
    checks = (
        (more_specific_if, SStubPattern.MORE_SPECIFIC_IF),
        (less_specific_if, SStubPattern.LESS_SPECIFIC_IF),
    )

    for accepts, pattern in checks:
        if accepts(source_ast, target_ast):
            return pattern

    return SStubPattern.SINGLE_STMT
398 | 
399 | # Change iterable ----------------------------------------------------------------
400 | 
def add_elements_to_iterable(source_ast, target_ast):
    """True when the target has more children and keeps all source children.

    Every child of the source must match some child of the target under
    ``pisomorph``.
    """
    old_children, new_children = source_ast.children, target_ast.children

    if len(old_children) >= len(new_children):
        return False

    return all(
        any(pisomorph(new, old) for new in new_children)
        for old in old_children
    )
411 | 
412 | 
def change_iterable(source_ast, target_ast):
    """Classify a change to an iterable literal.

    Returns ``ADD_ELEMENTS_TO_ITERABLE`` when elements were only added,
    else ``SINGLE_STMT``.
    """
    only_added = add_elements_to_iterable(source_ast, target_ast)

    if not only_added:
        return SStubPattern.SINGLE_STMT

    return SStubPattern.ADD_ELEMENTS_TO_ITERABLE
419 | 
420 | 
421 | # ADD CALL AROUND STATEMENT ----------------------------------------------------------------
422 | 
def add_function_around_expression(source_ast, target_ast):
    """True when the source reappears as an argument of the target call.

    The last child of the target must be an ``argument_list`` and one of
    its children must match the source under ``pisomorph``.
    """
    if len(target_ast.children) == 0:
        return False

    last_child = target_ast.children[-1]
    if last_child.type != "argument_list":
        return False

    # The number of arguments is deliberately not checked: adding arguments
    # together with the function seems to be okay (see PySStuBs dataset).
    return any(pisomorph(argument, source_ast) for argument in last_child.children)
439 | 
440 | 
def add_function(source_ast, target_ast):
    """Classify a change that introduces a call.

    Returns ``ADD_FUNCTION_AROUND_EXPRESSION`` or ``ADD_METHOD_CALL`` when
    the matching test accepts the pair (tried in that order), else
    ``SINGLE_STMT``.
    """
    checks = (
        (add_function_around_expression, SStubPattern.ADD_FUNCTION_AROUND_EXPRESSION),
        (add_method_call, SStubPattern.ADD_METHOD_CALL),
    )

    for accepts, pattern in checks:
        if accepts(source_ast, target_ast):
            return pattern

    return SStubPattern.SINGLE_STMT
450 | 
451 | 
452 | # ADD METHOD ----------------------------------------------------------------
453 | 
def add_method_call(source_ast, target_ast):
    """True when the target calls something on the source expression.

    The first child of the target must be an ``attribute`` or ``call`` node
    whose own first child matches the source under ``pisomorph``.
    """
    if len(target_ast.children) == 0:
        return False

    head = target_ast.children[0]

    if head.type in ("attribute", "call"):
        return pisomorph(head.children[0], source_ast)

    return False
462 | 
463 | # ADD attribute -------------------------------------------------------------
464 | 
465 | 
def add_attribute_access(source_ast, target_ast):
    """Classify a change that adds an attribute access to the source.

    Returns ``ADD_ATTRIBUTE_ACCESS`` when the first child of the target
    matches the source under ``pisomorph``, else ``SINGLE_STMT``.
    """
    accessed = target_ast.children[0]

    if not pisomorph(accessed, source_ast):
        return SStubPattern.SINGLE_STMT

    return SStubPattern.ADD_ATTRIBUTE_ACCESS
471 | 
472 | 
473 | # Change unary operator ----------------------------------------------------
474 | 
def is_unary_operator(node):
    """True for a node whose type contains ``operator`` and that has exactly two children."""
    return "operator" in node.type and len(node.children) == 2
478 | 
479 | 
def is_unary_operator_change(source_ast, target_ast):
    """True when a unary operator was added to or removed from an expression.

    Holds if one of the nodes is a unary operator and one of its children
    matches the other node under ``pisomorph``.  The source is tested first.
    """
    if is_unary_operator(source_ast) and any(
            pisomorph(child, target_ast) for child in source_ast.children):
        return True

    return is_unary_operator(target_ast) and any(
        pisomorph(child, source_ast) for child in target_ast.children)
491 |     


--------------------------------------------------------------------------------
/code_diff/utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def cached_property(fnc):
 3 |     name = fnc.__name__
 4 | 
 5 |     def load_from_cache(self):
 6 |         if not hasattr(self, "_cache"): self._cache = {}
 7 | 
 8 |         if name not in self._cache:
 9 |             self._cache[name] = fnc(self)
10 |         
11 |         return self._cache[name]
12 |     
13 |     return property(load_from_cache)


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools"] 
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "code_diff"
 7 | version = "v0.1.3"
 8 | description = "Fast AST based code differencing in Python"
 9 | readme = "README.md"
10 | requires-python = ">= 3.8"
11 | license = { file = "LICENSE" }
12 | keywords = ["code", "differencing", "AST", "CST", "program", "language processing"]
13 | 
14 | authors = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}]
15 | maintainers = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}]
16 | 
17 | classifiers = [
18 |     "Development Status :: 3 - Alpha",    
19 |     "Intended Audience :: Developers",  
20 |     "Topic :: Software Development :: Build Tools",
21 |     "License :: OSI Approved :: MIT License",
22 |     "Programming Language :: Python :: 3", 
23 |     "Programming Language :: Python :: 3.6",
24 |     "Programming Language :: Python :: 3.7",
25 |     "Programming Language :: Python :: 3.8",
26 |     "Programming Language :: Python :: 3.9",
27 |     "Programming Language :: Python :: 3.10",
28 |     "Programming Language :: Python :: 3.11",
29 |     "Programming Language :: Python :: 3.12",
30 |     "Programming Language :: Python :: 3.13",
31 |     "Programming Language :: Python :: 3 :: Only",
32 |   ]
33 | 
34 | dependencies = ["code_tokenize", "apted"]
35 | 
36 | [project.urls]
37 | "Homepage" = "https://github.com/cedricrupb/code_diff"
38 | "Bug Reports" = "https://github.com/cedricrupb/code_diff/issues"
39 | "Source" = "https://github.com/cedricrupb/code_diff"


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | code-tokenize >= 0.1.0
2 | apted >= 1.0.3


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | long_description_content_type = text/markdown


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md", "r") as f:
 4 |     long_description = f.read()
 5 | 
 6 | setup(
 7 |   name = 'code_diff',
 8 |   packages = ['code_diff', 'code_diff.gumtree'], 
 9 |   version = '0.1.3', 
10 |   license='MIT',     
11 |   description = 'Fast AST based code differencing in Python',
12 |   long_description = long_description,
13 |   long_description_content_type="text/markdown",
14 |   author = 'Cedric Richter',                   
15 |   author_email = 'cedricr.upb@gmail.com',    
16 |   url = 'https://github.com/cedricrupb/code_diff',  
17 |   download_url = 'https://github.com/cedricrupb/code_diff/archive/refs/tags/v0.1.3.tar.gz', 
18 |   keywords = ['code', 'differencing', 'AST', 'program', 'language processing'], 
19 |   install_requires=[          
20 |           'code-tokenize>=0.2.1',
21 |           'apted'
22 |       ],
23 |   classifiers=[
24 |     'Development Status :: 3 - Alpha',    
25 |     'Intended Audience :: Developers',  
26 |     'Topic :: Software Development :: Build Tools',
27 |     'License :: OSI Approved :: MIT License',
28 |     'Programming Language :: Python :: 3', 
29 |     'Programming Language :: Python :: 3.6',
30 |     'Programming Language :: Python :: 3.7',
31 |     'Programming Language :: Python :: 3.8',
32 |     'Programming Language :: Python :: 3.9',
33 |     'Programming Language :: Python :: 3.10',
34 |     'Programming Language :: Python :: 3.11',
35 |     'Programming Language :: Python :: 3.12',
36 |     'Programming Language :: Python :: 3.13',
37 |   ],
38 | )


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cedricrupb/code_diff/e8c6a8fdc953d8e6db414d31c4ca90bd5ceaf2b4/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_sstubs.py:
--------------------------------------------------------------------------------
   1 | import code_diff as cd
   2 | 
   3 | from code_diff.diff_utils import parse_hunks
   4 | from code_diff import SStubPattern
   5 | 
   6 | # Util --------------------------------------------------------------
   7 | 
   8 | def compute_diff_sstub(diff):
   9 |     hunks = parse_hunks(diff)
  10 |     hunk  = hunks[0]
  11 |     diff  = cd.difference(hunk.before, hunk.after, lang = "python")
  12 |     return diff.sstub_pattern()
  13 | 
  14 | 
  15 | 
  16 | # Wrong Function name ----------------------------------------------
  17 | 
  18 | def test_wrong_function_name_1():
  19 | 
  20 |     test = """
  21 | @@ -0,0 +0,0 @@ test
  22 |     
  23 | - test()
  24 | + test2()
  25 |     
  26 |     """
  27 |     
  28 |     assert compute_diff_sstub(test) == SStubPattern.WRONG_FUNCTION_NAME
  29 | 
  30 | 
  31 | def test_wrong_function_name_2():
  32 | 
  33 |     test = """
  34 | @@ -0,0 +0,0 @@ test
  35 |     
  36 | - test.call()
  37 | + test.call_async()
  38 |     
  39 |     """
  40 |     
  41 |     assert compute_diff_sstub(test) == SStubPattern.WRONG_FUNCTION_NAME
  42 | 
  43 | 
  44 | def test_wrong_function_name_3():
  45 | 
  46 |     test = """
  47 | @@ -0,0 +0,0 @@ test
  48 |     
  49 | - test.call_async('Hello World', x, x / 2)
  50 | + test.call('Hello World', x, x / 2)
  51 |     
  52 |     """
  53 |     
  54 |     assert compute_diff_sstub(test) == SStubPattern.WRONG_FUNCTION_NAME
  55 | 
  56 | 
  57 | def test_wrong_function_name_4():
  58 | 
  59 |     test = """
  60 | @@ -0,0 +0,0 @@ test
  61 |     
  62 | - test_call.call('Hello World', x, x / 2)
  63 | + test.call('Hello World', x, x / 2)
  64 |     
  65 |     """
  66 |     
  67 |     assert compute_diff_sstub(test) != SStubPattern.WRONG_FUNCTION_NAME
  68 | 
  69 | 
  70 | def test_wrong_function_name_5():
  71 | 
  72 |     test = """
  73 | @@ -0,0 +0,0 @@ test
  74 |     
  75 | - test.x.call('Hello World', x, x / 2)
  76 | + test.y.call('Hello World', x, x / 2)
  77 |     
  78 |     """
  79 |     
  80 |     assert compute_diff_sstub(test) != SStubPattern.WRONG_FUNCTION_NAME
  81 | 
  82 | 
  83 | 
  84 | # Same Function more args -------------------------------------------
  85 | 
  86 | def test_same_function_more_args_1():
  87 | 
  88 |     test = """
  89 | @@ -0,0 +0,0 @@ test
  90 |     
  91 | - test()
  92 | + test(x)
  93 |     
  94 |     """
  95 |     
  96 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_MORE_ARGS
  97 | 
  98 | 
  99 | def test_same_function_more_args_2():
 100 | 
 101 |     test = """
 102 | @@ -0,0 +0,0 @@ test
 103 |     
 104 | - test(x)
 105 | + test(x, y)
 106 |     
 107 |     """
 108 |     
 109 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_MORE_ARGS
 110 | 
 111 | 
 112 | def test_same_function_more_args_3():
 113 | 
 114 |     test = """
 115 | @@ -0,0 +0,0 @@ test
 116 |     
 117 | - test(x, y)
 118 | + test(x, y + 1)
 119 |     
 120 |     """
 121 |     
 122 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_MORE_ARGS
 123 | 
 124 | 
 125 | def test_same_function_more_args_4():
 126 | 
 127 |     test = """
 128 | @@ -0,0 +0,0 @@ test
 129 |     
 130 | - test(x)
 131 | + test(x, y + 1)
 132 |     
 133 |     """
 134 |     
 135 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_MORE_ARGS
 136 | 
 137 | 
 138 | def test_same_function_more_args_5():
 139 | 
 140 |     test = """
 141 | @@ -0,0 +0,0 @@ test
 142 |     
 143 | - test(x + 1)
 144 | + test(x, y + 1)
 145 |     
 146 |     """
 147 |     
 148 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_MORE_ARGS
 149 | 
 150 | 
 151 | def test_same_function_more_args_6():
 152 | 
 153 |     test = """
 154 | @@ -0,0 +0,0 @@ test
 155 |     
 156 | - test.call(x)
 157 | + test.call(x, y)
 158 |     
 159 |     """
 160 |     
 161 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_MORE_ARGS
 162 | 
 163 | 
 164 | def test_same_function_more_args_7():
 165 | 
 166 |     test = """
 167 | @@ -0,0 +0,0 @@ test
 168 |     
 169 | - test.call(x)
 170 | + test.call(x, y, z, d, a, call())
 171 |     
 172 |     """
 173 |     
 174 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_MORE_ARGS
 175 | 
 176 | 
 177 | def test_same_function_more_args_8():
 178 | 
 179 |     test = """
 180 | @@ -0,0 +0,0 @@ test
 181 |     
 182 | - test.call1(x)
 183 | + test.call(x, y, z, d, a, call())
 184 |     
 185 |     """
 186 |     
 187 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_MORE_ARGS
 188 | 
 189 | # Same Function less args -------------------------------------------
 190 | 
 191 | def test_same_function_less_args_1():
 192 | 
 193 |     test = """
 194 | @@ -0,0 +0,0 @@ test
 195 |     
 196 | - test(x)
 197 | + test()
 198 |     
 199 |     """
 200 |     
 201 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_LESS_ARGS
 202 | 
 203 | 
 204 | def test_same_function_less_args_2():
 205 | 
 206 |     test = """
 207 | @@ -0,0 +0,0 @@ test
 208 |     
 209 | - test(x, y)
 210 | + test(x)
 211 |     
 212 |     """
 213 |     
 214 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_LESS_ARGS
 215 | 
 216 | 
 217 | 
 218 | def test_same_function_less_args_3():
 219 | 
 220 |     test = """
 221 | @@ -0,0 +0,0 @@ test
 222 |     
 223 | - test(x, y + 1)
 224 | + test(x, y)
 225 |     
 226 |     """
 227 |     
 228 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_LESS_ARGS
 229 | 
 230 | 
 231 | def test_same_function_less_args_4():
 232 | 
 233 |     test = """
 234 | @@ -0,0 +0,0 @@ test
 235 |     
 236 | - test(x, y + 1)
 237 | + test(x)
 238 |     
 239 |     """
 240 |     
 241 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_LESS_ARGS
 242 | 
 243 | 
 244 | def test_same_function_less_args_5():
 245 | 
 246 |     test = """
 247 | @@ -0,0 +0,0 @@ test
 248 |     
 249 | - test(x, y + 1)
 250 | + test(x + 1)
 251 |     
 252 |     """
 253 |     
 254 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_LESS_ARGS
 255 | 
 256 | 
 257 | def test_same_function_less_args_6():
 258 | 
 259 |     test = """
 260 | @@ -0,0 +0,0 @@ test
 261 |     
 262 | - test.call(x, y)
 263 | + test.call(x)
 264 |     
 265 |     """
 266 |     
 267 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_LESS_ARGS
 268 | 
 269 | 
 270 | def test_same_function_less_args_7():
 271 | 
 272 |     test = """
 273 | @@ -0,0 +0,0 @@ test
 274 |     
 275 | - test.call(x, y, z, d, a, call())
 276 | + test.call(x)
 277 |     
 278 |     """
 279 |     
 280 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_LESS_ARGS
 281 | 
 282 | # Same Function wrong caller -------------------------------------------
 283 | 
 284 | def test_same_function_wrong_caller_1():
 285 | 
 286 |     test = """
 287 | @@ -0,0 +0,0 @@ test
 288 |     
 289 | - test.call()
 290 | + test1.call()
 291 |     
 292 |     """
 293 |     
 294 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_WRONG_CALLER
 295 | 
 296 | 
 297 | def test_same_function_wrong_caller_2():
 298 | 
 299 |     test = """
 300 | @@ -0,0 +0,0 @@ test
 301 |     
 302 | - test.x.call()
 303 | + test.y.call()
 304 |     
 305 |     """
 306 |     
 307 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_WRONG_CALLER
 308 | 
 309 | 
 310 | def test_same_function_wrong_caller_3():
 311 | 
 312 |     test = """
 313 | @@ -0,0 +0,0 @@ test
 314 |     
 315 | - call()
 316 | + test.call()
 317 |     
 318 |     """
 319 |     
 320 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_WRONG_CALLER
 321 | 
 322 | 
 323 | # Same Function swap args -------------------------------------------
 324 | 
 325 | def test_same_function_swap_args_1():
 326 | 
 327 |     test = """
 328 | @@ -0,0 +0,0 @@ test
 329 |     
 330 | - test.call(x, y)
 331 | + test.call(y, x)
 332 |     
 333 |     """
 334 |     
 335 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_SWAP_ARGS
 336 | 
 337 | 
 338 | def test_same_function_swap_args_2():
 339 | 
 340 |     test = """
 341 | @@ -0,0 +0,0 @@ test
 342 |     
 343 | - test.call1(x, y)
 344 | + test.call(y, x)
 345 |     
 346 |     """
 347 |     
 348 |     assert compute_diff_sstub(test) != SStubPattern.SAME_FUNCTION_SWAP_ARGS
 349 | 
 350 | 
 351 | def test_same_function_swap_args_3():
 352 | 
 353 |     test = """
 354 | @@ -0,0 +0,0 @@ test
 355 |     
 356 | - test.call(x, y, z)
 357 | + test.call(y, x, z)
 358 |     
 359 |     """
 360 |     
 361 |     assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_SWAP_ARGS
 362 | 
 363 | 
 364 | def bin_swaps(x):
 365 |     for i in range(len(x) - 1):
 366 |         for j in range(i + 1, len(x)):
 367 |             result = list(x)
 368 |             result[i], result[j] = result[j], result[i]
 369 |             yield result
 370 | 
 371 | 
 372 | def test_same_function_swap_args_auto():
 373 | 
 374 |     args = ["a", "b", "c", "d + 1", "0 if a != 0 else 1"]
 375 | 
 376 |     for l in range(2, len(args)):
 377 |         perm = tuple(args[:l])
 378 | 
 379 |         for p in bin_swaps(perm):
 380 | 
 381 |             test = """
 382 | @@ -0,0 +0,0 @@ test
 383 |             
 384 | - test.call(%s)
 385 | + test.call(%s)
 386 |             
 387 |             """ % (", ".join(perm), ", ".join(p))
 388 |             
 389 |             assert compute_diff_sstub(test) == SStubPattern.SAME_FUNCTION_SWAP_ARGS
 390 | 
 391 | 
 392 | # Add function around expression -------------------------------------------
 393 | 
 394 | def test_add_function_around_expression_1():
 395 | 
 396 |     test = """
 397 | @@ -0,0 +0,0 @@ test
 398 |     
 399 | - result = x
 400 | + result = int(x)
 401 |     
 402 |     """
 403 |     
 404 |     assert compute_diff_sstub(test) == SStubPattern.ADD_FUNCTION_AROUND_EXPRESSION
 405 | 
 406 | 
 407 | def test_add_function_around_expression_2():
 408 | 
 409 |     test = """
 410 | @@ -0,0 +0,0 @@ test
 411 |     
 412 | - result = x + 1
 413 | + result = int(x) + 1
 414 |     
 415 |     """
 416 |     
 417 |     assert compute_diff_sstub(test) == SStubPattern.ADD_FUNCTION_AROUND_EXPRESSION
 418 | 
 419 | 
 420 | def test_add_function_around_expression_3():
 421 | 
 422 |     test = """
 423 | @@ -0,0 +0,0 @@ test
 424 |     
 425 | - result = x + 1
 426 | + result = int(x + 1)
 427 |     
 428 |     """
 429 |     
 430 |     assert compute_diff_sstub(test) == SStubPattern.ADD_FUNCTION_AROUND_EXPRESSION
 431 | 
 432 | # Add method call --------------------------------------------------------
 433 | 
 434 | 
 435 | def test_add_method_call_1():
 436 | 
 437 |     test = """
 438 | @@ -0,0 +0,0 @@ test
 439 |     
 440 | - result = x
 441 | + result = x.get()
 442 |     
 443 |     """
 444 |     
 445 |     assert compute_diff_sstub(test) == SStubPattern.ADD_METHOD_CALL
 446 | 
 447 | 
 448 | def test_add_method_call_2():
 449 | 
 450 |     test = """
 451 | @@ -0,0 +0,0 @@ test
 452 |     
 453 | - result = x.get()
 454 | + result = x.get().return()
 455 |     
 456 |     """
 457 |     
 458 |     assert compute_diff_sstub(test) == SStubPattern.ADD_METHOD_CALL
 459 | 
 460 | 
 461 | def test_add_method_call_3():
 462 | 
 463 |     test = """
 464 | @@ -0,0 +0,0 @@ test
 465 |     
 466 | - result = x.y
 467 | + result = x.y.get()
 468 |     
 469 |     """
 470 |     
 471 |     assert compute_diff_sstub(test) == SStubPattern.ADD_METHOD_CALL
 472 | 
 473 | 
 474 | def test_add_method_call_4():
 475 | 
 476 |     test = """
 477 | @@ -0,0 +0,0 @@ test
 478 |     
 479 | - result = x.get()
 480 | + result = x.return().get()
 481 |     
 482 |     """
 483 |     
 484 |     assert compute_diff_sstub(test) == SStubPattern.ADD_METHOD_CALL
 485 | 
 486 | 
 487 | def test_add_method_call_5():
 488 | 
 489 |     test = """
 490 | @@ -0,0 +0,0 @@ test
 491 |     
 492 | - result = x.get()
 493 | + result = x.return.get()
 494 |     
 495 |     """
 496 |     
 497 |     assert compute_diff_sstub(test) != SStubPattern.ADD_METHOD_CALL
 498 | 
 499 | 
 500 | 
 501 | def test_add_method_call_6():
 502 | 
 503 |     test = """
 504 | @@ -0,0 +0,0 @@ test
 505 |     
 506 | - result = x.return().get()
 507 | + result = x.get()
 508 |     
 509 |     """
 510 |     
 511 |     assert compute_diff_sstub(test) != SStubPattern.ADD_METHOD_CALL
 512 | 
 513 | # Change identifier --------------------------------------------------------
 514 | 
 515 | def test_change_identifier_used_1():
 516 | 
 517 |     test = """
 518 | @@ -0,0 +0,0 @@ test
 519 |     
 520 | - result = x
 521 | + result = y
 522 |     
 523 |     """
 524 |     
 525 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_IDENTIFIER_USED
 526 | 
 527 | 
 528 | def test_change_identifier_used_2():
 529 | 
 530 |     test = """
 531 | @@ -0,0 +0,0 @@ test
 532 |     
 533 | - result = test(path = path)
 534 | + result = test(path = path2)
 535 |     
 536 |     """
 537 |     
 538 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_IDENTIFIER_USED
 539 | 
 540 | 
 541 | def test_change_identifier_used_2():
 542 | 
 543 |     test = """
 544 | @@ -0,0 +0,0 @@ test
 545 |     
 546 | - result = test(path = path)
 547 | + result = test(path2 = path)
 548 |     
 549 |     """
 550 |     
 551 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_IDENTIFIER_USED
 552 | 
 553 | 
 554 | def test_change_identifier_used_3():
 555 | 
 556 |     test = """
 557 | @@ -0,0 +0,0 @@ test
 558 |     
 559 | - result = test(path = path)
 560 | + result = test2(path = path)
 561 |     
 562 |     """
 563 |     
 564 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_IDENTIFIER_USED
 565 | 
 566 | 
 567 | def test_change_identifier_used_4():
 568 | 
 569 |     test = """
 570 | @@ -0,0 +0,0 @@ test
 571 |     
 572 | - result = test.x(a, b, c)
 573 | + result = test1.x(a, b, c)
 574 |     
 575 |     """
 576 |     
 577 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_IDENTIFIER_USED
 578 | 
 579 | 
 580 | def test_change_identifier_used_5():
 581 | 
 582 |     test = """
 583 | @@ -0,0 +0,0 @@ test
 584 |     
 585 | - result = test.x(a, b, c)
 586 | + result1 = test.x(a, b, c)
 587 |     
 588 |     """
 589 |     
 590 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_IDENTIFIER_USED
 591 | 
 592 | 
 593 | # Change numeric literal ----------------------------------------------------
 594 | 
 595 | def test_change_numeric_literal_1():
 596 | 
 597 |     test = """
 598 | @@ -0,0 +0,0 @@ test
 599 |     
 600 | - result = 0
 601 | + result = 1
 602 |     
 603 |     """
 604 |     
 605 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_NUMERIC_LITERAL
 606 | 
 607 | 
 608 | def test_change_numeric_literal_2():
 609 | 
 610 |     test = """
 611 | @@ -0,0 +0,0 @@ test
 612 |     
 613 | - result = x + 1
 614 | + result = x + 5
 615 |     
 616 |     """
 617 |     
 618 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_NUMERIC_LITERAL
 619 | 
 620 | 
 621 | def test_change_numeric_literal_3():
 622 | 
 623 |     test = """
 624 | @@ -0,0 +0,0 @@ test
 625 |     
 626 | - result = x + 1
 627 | + result = x + 5.0
 628 |     
 629 |     """
 630 |     
 631 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_NUMERIC_LITERAL
 632 | 
 633 | 
 634 | def test_change_numeric_literal_4():
 635 | 
 636 |     test = """
 637 | @@ -0,0 +0,0 @@ test
 638 |     
 639 | - result = x + 1
 640 | + result = x + 1.0
 641 |     
 642 |     """
 643 |     
 644 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_NUMERIC_LITERAL
 645 | 
 646 | 
 647 | def test_change_numeric_literal_5():
 648 | 
 649 |     test = """
 650 | @@ -0,0 +0,0 @@ test
 651 |     
 652 | - result = x + 1
 653 | + result = x + a
 654 |     
 655 |     """
 656 |     
 657 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_NUMERIC_LITERAL
 658 | 
 659 | 
 660 | # Change boolean literal ----------------------------------------------------
 661 | 
 662 | def test_change_boolean_literal_1():
 663 | 
 664 |     test = """
 665 | @@ -0,0 +0,0 @@ test
 666 |     
 667 | - if True:
 668 | + if False:
 669 |     pass
 670 |     
 671 |     """
 672 |     
 673 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BOOLEAN_LITERAL
 674 | 
 675 | 
 676 | def test_change_boolean_literal_2():
 677 | 
 678 |     test = """
 679 | @@ -0,0 +0,0 @@ test
 680 |     
 681 | - if True and x < 0:
 682 | + if False and x < 0:
 683 |     pass
 684 |     
 685 |     """
 686 |     
 687 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BOOLEAN_LITERAL
 688 | 
 689 | 
 690 | def test_change_boolean_literal_3():
 691 | 
 692 |     test = """
 693 | @@ -0,0 +0,0 @@ test
 694 |     
 695 | - if False and x < 0:
 696 | + if True and x < 0:
 697 |     pass
 698 |     
 699 |     """
 700 |     
 701 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BOOLEAN_LITERAL
 702 | 
 703 | 
 704 | def test_change_boolean_literal_4():
 705 | 
 706 |     test = """
 707 | @@ -0,0 +0,0 @@ test
 708 |     
 709 | - if False and x < 0:
 710 | + if x / 2 == 0 and x < 0:
 711 |     pass
 712 |     
 713 |     """
 714 |     
 715 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_BOOLEAN_LITERAL
 716 | 
 717 | # Change unary operator ----------------------------------------------------
 718 | 
 719 | def test_change_unary_operator_1():
 720 | 
 721 |     test = """
 722 | @@ -0,0 +0,0 @@ test
 723 |     
 724 | - if x:
 725 | + if not x:
 726 |     pass
 727 |     
 728 |     """
 729 |     
 730 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 731 | 
 732 | 
 733 | def test_change_unary_operator_2():
 734 | 
 735 |     test = """
 736 | @@ -0,0 +0,0 @@ test
 737 |     
 738 | - result = x
 739 | + result = -x
 740 |     pass
 741 |     
 742 |     """
 743 |     
 744 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 745 | 
 746 | 
 747 | def test_change_unary_operator_3():
 748 | 
 749 |     test = """
 750 | @@ -0,0 +0,0 @@ test
 751 |     
 752 | - result = x
 753 | + result = +x
 754 |     pass
 755 |     
 756 |     """
 757 |     
 758 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 759 | 
 760 | 
 761 | def test_change_unary_operator_4():
 762 | 
 763 |     test = """
 764 | @@ -0,0 +0,0 @@ test
 765 |     
 766 | - if not x:
 767 | + if x:
 768 |     pass
 769 |     
 770 |     """
 771 |     
 772 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 773 | 
 774 | 
 775 | def test_change_unary_operator_5():
 776 | 
 777 |     test = """
 778 | @@ -0,0 +0,0 @@ test
 779 |     
 780 | - result = -x
 781 | + result = x
 782 |     pass
 783 |     
 784 |     """
 785 |     
 786 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 787 | 
 788 | 
 789 | def test_change_unary_operator_6():
 790 | 
 791 |     test = """
 792 | @@ -0,0 +0,0 @@ test
 793 |     
 794 | - result = +x
 795 | + result = x
 796 |     pass
 797 |     
 798 |     """
 799 |     
 800 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 801 | 
 802 | 
 803 | def test_change_unary_operator_7():
 804 | 
 805 |     test = """
 806 | @@ -0,0 +0,0 @@ test
 807 |     
 808 | - if x and y:
 809 | + if not x and y:
 810 |     pass
 811 |     
 812 |     """
 813 |     
 814 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 815 | 
 816 | 
 817 | def test_change_unary_operator_8():
 818 | 
 819 |     test = """
 820 | @@ -0,0 +0,0 @@ test
 821 |     
 822 | - if x and y:
 823 | + if not (x and y):
 824 |     pass
 825 |     
 826 |     """
 827 |     
 828 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_UNARY_OPERATOR
 829 | 
 830 | 
 831 | 
 832 | # Change binary operator ----------------------------------------------------
 833 | 
 834 | def test_change_binary_operator_1():
 835 | 
 836 |     test = """
 837 | @@ -0,0 +0,0 @@ test
 838 |     
 839 | - if x and y:
 840 | + if x or y:
 841 |     pass
 842 |     
 843 |     """
 844 |     
 845 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 846 | 
 847 | 
 848 | def test_change_binary_operator_2():
 849 | 
 850 |     test = """
 851 | @@ -0,0 +0,0 @@ test
 852 |     
 853 | - if x or y:
 854 | + if x and y:
 855 |     pass
 856 |     
 857 |     """
 858 |     
 859 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 860 | 
 861 | 
 862 | def test_change_binary_operator_3():
 863 | 
 864 |     test = """
 865 | @@ -0,0 +0,0 @@ test
 866 |     
 867 | - if x + y:
 868 | + if x or y:
 869 |     pass
 870 |     
 871 |     """
 872 |     
 873 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 874 | 
 875 | 
 876 | def test_change_binary_operator_4():
 877 | 
 878 |     test = """
 879 | @@ -0,0 +0,0 @@ test
 880 |     
 881 | - if x and y:
 882 | + if x - y:
 883 |     pass
 884 |     
 885 |     """
 886 |     
 887 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 888 | 
 889 | 
 890 | def test_change_binary_operator_5():
 891 | 
 892 |     test = """
 893 | @@ -0,0 +0,0 @@ test
 894 |     
 895 | - if x + y:
 896 | + if x - y:
 897 |     pass
 898 |     
 899 |     """
 900 |     
 901 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 902 | 
 903 | 
 904 | def test_change_binary_operator_6():
 905 | 
 906 |     test = """
 907 | @@ -0,0 +0,0 @@ test
 908 |     
 909 | - if x + y:
 910 | + if x % y:
 911 |     pass
 912 |     
 913 |     """
 914 |     
 915 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 916 | 
 917 | 
 918 | def test_change_binary_operator_7():
 919 | 
 920 |     test = """
 921 | @@ -0,0 +0,0 @@ test
 922 |     
 923 | - if x + y:
 924 | + if x / y:
 925 |     pass
 926 |     
 927 |     """
 928 |     
 929 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 930 | 
 931 | 
 932 | def test_change_binary_operator_8():
 933 | 
 934 |     test = """
 935 | @@ -0,0 +0,0 @@ test
 936 |     
 937 | - if x + y < 5:
 938 | + if x + y <= 5:
 939 |     pass
 940 |     
 941 |     """
 942 |     
 943 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 944 | 
 945 | 
 946 | def test_change_binary_operator_9():
 947 | 
 948 |     test = """
 949 | @@ -0,0 +0,0 @@ test
 950 |     
 951 | - if x + y < 5 and is_t:
 952 | + if x + y <= 5 and is_t:
 953 |     pass
 954 |     
 955 |     """
 956 |     
 957 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERATOR
 958 | 
 959 | 
 960 | # Change binary operand -----------------------------------------------------
 961 | 
 962 | 
 963 | def test_change_binary_operand_1():
 964 | 
 965 |     test = """
 966 | @@ -0,0 +0,0 @@ test
 967 |     
 968 | - if x and y:
 969 | + if x and z:
 970 |     pass
 971 |     
 972 |     """
 973 |     
 974 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERAND
 975 | 
 976 | 
 977 | def test_change_binary_operand_2():
 978 | 
 979 |     test = """
 980 | @@ -0,0 +0,0 @@ test
 981 |     
 982 | - if x and y:
 983 | + if x and z <= 1:
 984 |     pass
 985 |     
 986 |     """
 987 |     
 988 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERAND
 989 | 
 990 | 
 991 | def test_change_binary_operand_3():
 992 | 
 993 |     test = """
 994 | @@ -0,0 +0,0 @@ test
 995 |     
 996 | - if x and y:
 997 | + if x > 8 and z <= 1:
 998 |     pass
 999 |     
1000 |     """
1001 |     
1002 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_BINARY_OPERAND
1003 | 
1004 | 
1005 | def test_change_binary_operand_4():
1006 | 
1007 |     test = """
1008 | @@ -0,0 +0,0 @@ test
1009 |     
1010 | - result = result + graphA / 2
1011 | + result = result + graphB / 2
1012 |     
1013 |     """
1014 |     
1015 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_BINARY_OPERAND
1016 | 
1017 | 
1018 | 
1019 | # Change attribute used ----------------------------------------------------------------
1020 | 
1021 | 
1022 | def test_change_attribute_used_1():
1023 | 
1024 |     test = """
1025 | @@ -0,0 +0,0 @@ test
1026 |     
1027 | - result = person.name
1028 | + result = person.age
1029 |     
1030 |     """
1031 |     
1032 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_ATTRIBUTE_USED
1033 | 
1034 | 
1035 | def test_change_attribute_used_2():
1036 | 
1037 |     test = """
1038 | @@ -0,0 +0,0 @@ test
1039 |     
1040 | - result = (x + y).name
1041 | + result = (x + y).age
1042 |     
1043 |     """
1044 |     
1045 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_ATTRIBUTE_USED
1046 | 
1047 | 
1048 | def test_change_attribute_used_3():
1049 | 
1050 |     test = """
1051 | @@ -0,0 +0,0 @@ test
1052 |     
1053 | - result = person.name.name
1054 | + result = person.age.age
1055 |     
1056 |     """
1057 |     
1058 |     assert compute_diff_sstub(test) != SStubPattern.CHANGE_ATTRIBUTE_USED
1059 | 
1060 | 
1061 | def test_change_attribute_used_4():
1062 | 
1063 |     test = """
1064 | @@ -0,0 +0,0 @@ test
1065 |     
1066 | - result = person.name.name
1067 | + result = person.age.name
1068 |     
1069 |     """
1070 |     
1071 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_ATTRIBUTE_USED
1072 | 
1073 | 
1074 | 
1075 | # Change keyword argument used ----------------------------------------------------------
1076 | 
1077 | def test_change_keyword_argument_used_1():
1078 | 
1079 |     test = """
1080 | @@ -0,0 +0,0 @@ test
1081 |     
1082 | - result = Person(name = 5)
1083 | + result = Person(age  = 5)
1084 |     
1085 |     """
1086 |     
1087 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1088 | 
1089 | 
1090 | def test_change_keyword_argument_used_2():
1091 | 
1092 |     test = """
1093 | @@ -0,0 +0,0 @@ test
1094 |     
1095 | - result = Person(path = path)
1096 | + result = Person(paths = path)
1097 |     
1098 |     """
1099 |     
1100 |     assert compute_diff_sstub(test) == SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1101 | 
1102 | 
def test_change_keyword_argument_used_3():
    """Changing the value of a keyword argument, not its name, is not CHANGE_KEYWORD_ARGUMENT_USED."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = Person(path = path)
+ result = Person(path = paths)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1114 | 
1115 | 
def test_change_keyword_argument_used_4():
    """Appending an extra keyword argument is not CHANGE_KEYWORD_ARGUMENT_USED."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = Person(path = path)
+ result = Person(path = path, path2 = path)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1127 | 
1128 | 
def test_change_keyword_argument_used_5():
    """Renaming the second of two keyword arguments is CHANGE_KEYWORD_ARGUMENT_USED."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = Person(path = path, path = path)
+ result = Person(path = path, path2 = path)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1140 | 
1141 | 
def test_change_keyword_argument_used_6():
    """Swapping the names of two keyword arguments is not CHANGE_KEYWORD_ARGUMENT_USED."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = Person(path2 = path, path = path)
+ result = Person(path = path, path2 = path)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.CHANGE_KEYWORD_ARGUMENT_USED
1153 | 
1154 | # Change constant type used --------------------------------------------------------------
1155 | 
def test_change_constant_type_used_1():
    """Rewriting the integer 3 as the float 3.0 is CHANGE_CONSTANT_TYPE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = 3
+ result = 3.0
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.CHANGE_CONSTANT_TYPE
1167 | 
1168 | 
def test_change_constant_type_used_2():
    """Replacing 3 with 3.1 is not CHANGE_CONSTANT_TYPE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = 3
+ result = 3.1
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.CHANGE_CONSTANT_TYPE
1180 | 
1181 | 
def test_change_constant_type_used_3():
    """Rewriting the integer 3 as the string '3' is CHANGE_CONSTANT_TYPE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = 3
+ result = '3'
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.CHANGE_CONSTANT_TYPE
1193 | 
1194 | 
def test_change_constant_type_used_4():
    """Switching a string literal from double to single quotes is not CHANGE_CONSTANT_TYPE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = "3"
+ result = '3'
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.CHANGE_CONSTANT_TYPE
1206 | 
1207 | 
def test_change_constant_type_used_5():
    """Rewriting the float 3.0 as the string '3' is CHANGE_CONSTANT_TYPE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = 3.0
+ result = '3'
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.CHANGE_CONSTANT_TYPE
1219 | 
1220 | # Add elements to iterable ----------------------------------------------------------------
1221 | 
def test_add_elements_to_iterable_1():
    """Adding one element to an empty tuple is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = ()
+ result = (1,)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1232 | 
1233 | 
def test_add_elements_to_iterable_2():
    """Appending a second element to a one-element tuple is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = (1,)
+ result = (1,2,)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1244 | 
1245 | 
def test_add_elements_to_iterable_3():
    """Appending a constant and a binary expression to a tuple is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = (1,)
+ result = (1,2, x + 1)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1256 | 
def test_add_elements_to_iterable_4():
    """Appending a constant, an expression and a call to a tuple is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = (1,)
+ result = (1,2, x + 1, fn())
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1267 | 
1268 | 
def test_add_elements_to_iterable_5():
    """Turning a tuple into a longer list is not ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = (1,)
+ result = [1,2, x + 1, fn()]
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1279 | 
1280 | 
def test_add_elements_to_iterable_6():
    """Appending elements at the end of a list is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = [1,2,]
+ result = [1,2, x + 1, fn()]
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1291 | 
1292 | 
def test_add_elements_to_iterable_7():
    """Inserting elements between existing list items is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = [1,2,]
+ result = [1, x + 1, fn(), 2]
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1303 | 
1304 | 
def test_add_elements_to_iterable_8():
    """Adding elements while dropping an existing one is not ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = [1,2,]
+ result = [1, x + 1, fn()]
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1315 | 
1316 | 
def test_add_elements_to_iterable_9():
    """Appending elements to a set literal is ADD_ELEMENTS_TO_ITERABLE."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = {1,2,}
+ result = {1,2, x + 1, fn()}
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ELEMENTS_TO_ITERABLE
1327 | 
1328 | # Add attribute access ---------------------------------------------------------------------
1329 | 
def test_add_attribute_access_1():
    """Passing `person.name` instead of `person` as a call argument is ADD_ATTRIBUTE_ACCESS."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = say_hello_to(person)
+ result = say_hello_to(person.name)
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ATTRIBUTE_ACCESS
1340 | 
1341 | 
def test_add_attribute_access_2():
    """Inserting an intermediate attribute into an existing chain is ADD_ATTRIBUTE_ACCESS."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- result = person.age
+ result = person.parent.age
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_ATTRIBUTE_ACCESS
1352 | 
1353 | 
1354 | # More specific if ------------------------------------------------------------------------
1355 | 
def test_more_specific_if_1():
    """Extending an if condition with `and y` is MORE_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if x:
+ if x and y:
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.MORE_SPECIFIC_IF
1367 | 
1368 | 
def test_more_specific_if_2():
    """Adding a call argument inside an if condition is not MORE_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if isinstance(x):
+ if isinstance(x, y):
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.MORE_SPECIFIC_IF
1380 | 
1381 | 
def test_more_specific_if_3():
    """Negating an if condition is not MORE_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if x:
+ if not x:
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.MORE_SPECIFIC_IF
1393 | 
1394 | 
def test_more_specific_if_4():
    """Extending an `and` condition with an `or` clause is not MORE_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if x and test():
+ if x and test() or test2():
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.MORE_SPECIFIC_IF
1406 | 
1407 | # Less specific if ------------------------------------------------------------------------
1408 | 
def test_less_specific_if_1():
    """Extending an if condition with `or y` is LESS_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if x:
+ if x or y:
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.LESS_SPECIFIC_IF
1420 | 
1421 | 
def test_less_specific_if_2():
    """Removing a call argument inside an if condition is not LESS_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if isinstance(x, y):
+ if isinstance(x):
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.LESS_SPECIFIC_IF
1433 | 
1434 | 
def test_less_specific_if_3():
    """Dropping the `not` from an if condition is not LESS_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if not x:
+ if x:
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected != SStubPattern.LESS_SPECIFIC_IF
1446 | 
1447 | 
def test_less_specific_if_4():
    """Extending an `and` condition with an `or` clause is LESS_SPECIFIC_IF."""
    patch = """
@@ -0,0 +0,0 @@ test
    
- if x and test():
+ if x and test() or test2():
    pass
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.LESS_SPECIFIC_IF
1459 | 
1460 | 
1461 | # Real world tests ----------------------------------------------------------------
1462 | 
def test_real_world_1():
    """Replacing `BenchmarkDb.get_instance(...)` with `BenchmarkDb(...)` is reported as SINGLE_STMT."""
    patch = """
@@ -16,7 +16,7 @@ def test_databases():
     bench2 = Benchmark(statement, setup, name='list with xrange',
         description='Xrange', start_date=datetime(2013, 3, 9))
 
-    dbHandler = BenchmarkDb.get_instance('bench.db')
+    dbHandler = BenchmarkDb('bench.db')
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.SINGLE_STMT
1475 | 
1476 | 
1477 | 
def test_real_world_2():
    """Wrapping `hour` in `int(...)` inside an if condition is ADD_FUNCTION_AROUND_EXPRESSION."""
    patch = """
@@ -146,7 +146,7 @@ class DatetimeWidget(DateWidget):
         if default in (year, month, day, hour, minute):
             return default

-        if self.ampm is True and hour != 12:
+        if self.ampm is True and int(hour)!=12:
             ampm = self.request.get(self.name + '-ampm', default)
             if ampm == 'PM':
                 hour = str(12+int(hour))
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.ADD_FUNCTION_AROUND_EXPRESSION
1493 | 
1494 | 
def test_real_world_3():
    """Adding an `or` clause to an elif condition, plus a new comment line, is LESS_SPECIFIC_IF."""
    patch = """
@@ -59,7 +59,8 @@ class UrlRewriteFilter(object):
         if ext in CONTENT_TYPES:
             # Use the content type specified by the extension
             return (path, CONTENT_TYPES[ext])
-        elif http_accept is None:
+        elif http_accept is None or http_accept == '*/*':
+            # TODO: This probably isn't the best place to handle "Accept: */*"
             # No extension or Accept header specified, use default
             return (path_info, DEFAULT_CONTENT_TYPE)
         else:
    
    """

    detected = compute_diff_sstub(patch)
    assert detected == SStubPattern.LESS_SPECIFIC_IF


--------------------------------------------------------------------------------