├── .bettercodehub.yml ├── .gitignore ├── .gitmodules ├── .pylintrc ├── LICENSE ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── GraphRepoArch.svg │ ├── GraphRepoArch_old.svg │ ├── GraphRepoDS.svg │ ├── GraphRepoSchema.svg │ ├── _templates │ └── breadcrumbs.html │ ├── architecture.rst │ ├── conf.py │ ├── configuration.rst │ ├── css │ └── custom.css │ ├── data_structure.rst │ ├── driller.rst │ ├── examples.rst │ ├── index.rst │ ├── installation.rst │ ├── mappers.rst │ └── miners.rst ├── examples ├── __init__.py ├── all_method_complexity.py ├── benchmarks │ ├── all_data.py │ ├── all_methods_complexity.py │ ├── dev_files.py │ ├── dev_methods.py │ └── file_nloc.py ├── configs │ ├── graphrepo.yml │ ├── grepo-test.yml │ ├── hadoop.yml │ ├── jax.yml │ ├── kibana.yml │ ├── pydriller.yml │ └── tensorflow.yml ├── dev_data.py ├── file_complexity.py ├── index_all.py └── mine_all.py ├── graphrepo ├── __init__.py ├── config.py ├── drillers │ ├── __init__.py │ ├── batch_utils.py │ ├── cache_driller.py │ ├── db_init.py │ ├── default.py │ ├── delete_all.py │ ├── drill_cache.py │ ├── driller.py │ ├── queue_driller.py │ ├── rabbit_driller.py │ └── stomp_driller.py ├── logger.py ├── mappers │ ├── __init__.py │ ├── csv.py │ └── default.py ├── miners │ ├── __init__.py │ ├── commit.py │ ├── default.py │ ├── developer.py │ ├── file.py │ ├── method.py │ ├── mine_manager.py │ └── utils.py ├── singleton.py └── utils.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── cnfg_init.yml ├── cnfg_simple.yml ├── test_cache_driller.py ├── test_commit.py ├── test_commit_miner.py ├── test_csv_mapper.py ├── test_db_init.py ├── test_dev_miner.py ├── test_driller.py ├── test_file.py ├── test_file_miner.py ├── test_method_miner.py ├── test_queue_driller.py └── test_utils.py /.bettercodehub.yml: -------------------------------------------------------------------------------- 1 | component_depth: 1 2 | languages: 3 | - python 4 | exclude: 5 | - /examples/.* 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # VSCode 2 | .vscode/ 3 | 4 | # repo 5 | repos/ 6 | data/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/gr-test"] 2 | path = tests/gr-test 3 | url = https://github.com/NullConvergence/gr-test 4 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns= 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=1 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist=numpy 36 | 37 | # Allow optimization of some AST trees. This will activate a peephole AST 38 | # optimizer, which will apply various small optimizations. For instance, it can 39 | # be used to obtain the result of joining multiple strings with the addition 40 | # operator. Joining a lot of strings can lead to a maximum recursion error in 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting 42 | # AST will be different than the one from reality. This option is deprecated 43 | # and it will be removed in Pylint 2.0. 
44 | optimize-ast=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 51 | confidence= 52 | 53 | # Enable the message, report, category or checker with the given id(s). You can 54 | # either give multiple identifier separated by comma (,) or put this option 55 | # multiple time (only on the command line, not in the configuration file where 56 | # it should appear only once). See also the "--disable" option for examples. 57 | #enable= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once).You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use"--disable=all --enable=classes 67 | # --disable=W" 68 | disable=long-suffix,standarderror-builtin,indexing-exception,delslice-method,unichr-builtin,dict-view-method,parameter-unpacking,unicode-builtin,cmp-builtin,intern-builtin,round-builtin,backtick,nonzero-method,xrange-builtin,coerce-method,raw_input-builtin,old-division,filter-builtin-not-iterating,old-octal-literal,input-builtin,map-builtin-not-iterating,buffer-builtin,basestring-builtin,zip-builtin-not-iterating,using-cmp-argument,unpacking-in-except,old-raise-syntax,coerce-builtin,dict-iter-method,hex-method,range-builtin-not-iterating,useless-suppression,cmp-method,print-statement,reduce-builtin,file-builtin,long-builtin,getslice-method,execfile-builtin,no-absolute-import,metaclass-assignment,oct-method,reload-builtin,import-star-module-level,suppressed-message,apply-builtin,raising-string,next-method-called,setslice-method,old-ne-operator,arguments-differ,wildcard-import,locally-disabled 69 | 70 | 71 | [REPORTS] 72 | 73 | # Set the output format. Available formats are text, parseable, colorized, msvs 74 | # (visual studio) and html. You can also give a reporter class, eg 75 | # mypackage.mymodule.MyReporterClass. 76 | output-format=text 77 | 78 | # Put messages in a separate file for each module / package specified on the 79 | # command line instead of printing them on stdout. Reports (if any) will be 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 81 | # and it will be removed in Pylint 2.0. 82 | files-output=no 83 | 84 | # Tells whether to display a full report or only the messages 85 | reports=yes 86 | 87 | # Python expression which should return a note less than 10 (10 is the highest 88 | # note). You have access to the variables errors warning, statement which 89 | # respectively contain the number of errors / warnings messages and the total 90 | # number of statements analyzed. This is used by the global evaluation report 91 | # (RP0004). 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 93 | 94 | # Template used to display messages. This is a python new-style format string 95 | # used to format the message information. 
See doc for all details 96 | #msg-template= 97 | 98 | 99 | [BASIC] 100 | 101 | # Good variable names which should always be accepted, separated by a comma 102 | good-names=i,j,k,ex,Run,_ 103 | 104 | # Bad variable names which should always be refused, separated by a comma 105 | bad-names=foo,bar,baz,toto,tutu,tata 106 | 107 | # Colon-delimited sets of names that determine each other's naming style when 108 | # the name regexes allow several styles. 109 | name-group= 110 | 111 | # Include a hint for the correct naming format with invalid-name 112 | include-naming-hint=no 113 | 114 | # List of decorators that produce properties, such as abc.abstractproperty. Add 115 | # to this list to register other decorators that produce valid properties. 116 | property-classes=abc.abstractproperty 117 | 118 | # Regular expression matching correct variable names 119 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 120 | 121 | # Naming hint for variable names 122 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 123 | 124 | # Regular expression matching correct class attribute names 125 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 126 | 127 | # Naming hint for class attribute names 128 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 129 | 130 | # Regular expression matching correct argument names 131 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 132 | 133 | # Naming hint for argument names 134 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Regular expression matching correct module names 137 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 138 | 139 | # Naming hint for module names 140 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 141 | 142 | # Regular expression matching correct constant names 143 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 144 | 145 | # Naming hint for constant names 146 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 147 | 148 | # Regular expression matching correct inline iteration names 149 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 150 | 151 | # Naming hint for inline iteration names 152 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 153 | 154 | # Regular expression matching correct method names 155 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 156 | 157 | # Naming hint for method names 158 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 159 | 160 | # Regular expression matching correct function names 161 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 162 | 163 | # Naming hint for function names 164 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 165 | 166 | # Regular expression matching correct attribute names 167 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 168 | 169 | # Naming hint for attribute names 170 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 171 | 172 | # Regular expression matching correct class names 173 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 174 | 175 | # Naming hint for class names 176 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 177 | 178 | # Regular expression which should only match function or class names that do 179 | # not require a docstring. 180 | no-docstring-rgx=^test_ 181 | 182 | # Minimum line length for functions/classes that require docstrings, shorter 183 | # ones are exempt. 184 | docstring-min-length=-1 185 | 186 | 187 | [ELIF] 188 | 189 | # Maximum number of nested blocks for function / method body 190 | max-nested-blocks=5 191 | 192 | 193 | [FORMAT] 194 | 195 | # Maximum number of characters on a single line. 196 | max-line-length=80 197 | 198 | # Regexp for a line that is allowed to be longer than the limit. 
199 | ignore-long-lines=^\s*(# )??$ 200 | 201 | # Allow the body of an if to be on the same line as the test if there is no 202 | # else. 203 | single-line-if-stmt=y 204 | 205 | # List of optional constructs for which whitespace checking is disabled. `dict- 206 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 207 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 208 | # `empty-line` allows space-only lines. 209 | no-space-check=trailing-comma,dict-separator 210 | 211 | # Maximum number of lines in a module 212 | max-module-lines=1000 213 | 214 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 215 | # tab). 216 | indent-string=' ' 217 | 218 | # Number of spaces of indent required inside a hanging or continued line. 219 | indent-after-paren=4 220 | 221 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 222 | expected-line-ending-format= 223 | 224 | 225 | [LOGGING] 226 | 227 | # Logging modules to check that the string format arguments are in logging 228 | # function parameter format 229 | logging-modules=logging 230 | 231 | 232 | [MISCELLANEOUS] 233 | 234 | # List of note tags to take in consideration, separated by a comma. 235 | notes=FIXME,XXX,TODO 236 | 237 | 238 | [SIMILARITIES] 239 | 240 | # Minimum lines number of a similarity. 241 | min-similarity-lines=10 242 | 243 | # Ignore comments when computing similarities. 244 | ignore-comments=yes 245 | 246 | # Ignore docstrings when computing similarities. 247 | ignore-docstrings=yes 248 | 249 | # Ignore imports when computing similarities. 250 | ignore-imports=no 251 | 252 | 253 | [SPELLING] 254 | 255 | # Spelling dictionary name. Available dictionaries: none. To make it working 256 | # install python-enchant package. 257 | spelling-dict= 258 | 259 | # List of comma separated words that should not be checked. 260 | spelling-ignore-words= 261 | 262 | # A path to a file that contains private dictionary; one word per line. 263 | spelling-private-dict-file= 264 | 265 | # Tells whether to store unknown words to indicated private dictionary in 266 | # --spelling-private-dict-file option instead of raising a message. 267 | spelling-store-unknown-words=no 268 | 269 | 270 | [TYPECHECK] 271 | 272 | # Tells whether missing members accessed in mixin class should be ignored. A 273 | # mixin class is detected if its name ends with "mixin" (case insensitive). 274 | ignore-mixin-members=yes 275 | 276 | # List of module names for which member attributes should not be checked 277 | # (useful for modules/projects where namespaces are manipulated during runtime 278 | # and thus existing member attributes cannot be deduced by static analysis. It 279 | # supports qualified module names, as well as Unix pattern matching. 280 | ignored-modules= 281 | 282 | # List of class names for which member attributes should not be checked (useful 283 | # for classes with dynamically set attributes). This supports the use of 284 | # qualified names. 285 | ignored-classes=optparse.Values,thread._local,_thread._local,matplotlib.cm,tensorflow.python,tensorflow,tensorflow.train.Example,RunOptions 286 | 287 | # List of members which are set dynamically and missed by pylint inference 288 | # system, and so shouldn't trigger E1101 when accessed. Python regular 289 | # expressions are accepted. 290 | generated-members=set_shape,np.float32 291 | 292 | # List of decorators that produce context managers, such as 293 | # contextlib.contextmanager. 
Add to this list to register other decorators that 294 | # produce valid context managers. 295 | contextmanager-decorators=contextlib.contextmanager 296 | 297 | 298 | [VARIABLES] 299 | 300 | # Tells whether we should check for unused import in __init__ files. 301 | init-import=no 302 | 303 | # A regular expression matching the name of dummy variables (i.e. expectedly 304 | # not used). 305 | dummy-variables-rgx=(_+[a-zA-Z0-9_]*?$)|dummy 306 | 307 | # List of additional names supposed to be defined in builtins. Remember that 308 | # you should avoid to define new builtins when possible. 309 | additional-builtins= 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_,_cb 314 | 315 | # List of qualified module names which can have objects that can redefine 316 | # builtins. 317 | redefining-builtins-modules=six.moves,future.builtins 318 | 319 | 320 | [CLASSES] 321 | 322 | # List of method names used to declare (i.e. assign) instance attributes. 323 | defining-attr-methods=__init__,__new__,setUp 324 | 325 | # List of valid names for the first argument in a class method. 326 | valid-classmethod-first-arg=cls 327 | 328 | # List of valid names for the first argument in a metaclass class method. 329 | valid-metaclass-classmethod-first-arg=mcs 330 | 331 | # List of member names, which should be excluded from the protected access 332 | # warning. 333 | exclude-protected=_asdict,_fields,_replace,_source,_make 334 | 335 | 336 | [DESIGN] 337 | 338 | # Maximum number of arguments for function / method 339 | max-args=10 340 | 341 | # Argument names that match this expression will be ignored. Default to name 342 | # with leading underscore 343 | ignored-argument-names=_.* 344 | 345 | # Maximum number of locals for function / method body 346 | max-locals=30 347 | 348 | # Maximum number of return / yield for function / method body 349 | max-returns=6 350 | 351 | # Maximum number of branch for function / method body 352 | max-branches=12 353 | 354 | # Maximum number of statements in function / method body 355 | max-statements=100 356 | 357 | # Maximum number of parents for a class (see R0901). 358 | max-parents=7 359 | 360 | # Maximum number of attributes for a class (see R0902). 361 | max-attributes=10 362 | 363 | # Minimum number of public methods for a class (see R0903). 364 | min-public-methods=0 365 | 366 | # Maximum number of public methods for a class (see R0904). 367 | max-public-methods=20 368 | 369 | # Maximum number of boolean expressions in a if statement 370 | max-bool-expr=5 371 | 372 | 373 | [IMPORTS] 374 | 375 | # Deprecated modules which should not be used, separated by a comma 376 | deprecated-modules=optparse 377 | 378 | # Create a graph of every (i.e. internal and external) dependencies in the 379 | # given file (report RP0402 must not be disabled) 380 | import-graph= 381 | 382 | # Create a graph of external dependencies in the given file (report RP0402 must 383 | # not be disabled) 384 | ext-import-graph= 385 | 386 | # Create a graph of internal dependencies in the given file (report RP0402 must 387 | # not be disabled) 388 | int-import-graph= 389 | 390 | # Force import order to recognize a module as part of the standard 391 | # compatibility libraries. 392 | known-standard-library= 393 | 394 | # Force import order to recognize a module as part of a third party library. 395 | known-third-party=enchant 396 | 397 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 398 | # 3 compatible code, which means that the block might have code that exists 399 | # only in one or another interpreter, leading to false positives when analysed. 400 | analyse-fallback-blocks=no 401 | 402 | 403 | [EXCEPTIONS] 404 | 405 | # Exceptions that will emit a warning when being caught. Defaults to 406 | # "Exception" 407 | overgeneral-exceptions=Exception 408 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphRepo ![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square) [![BCH compliance](https://bettercodehub.com/edge/badge/NullConvergence/GraphRepo?branch=develop)](https://bettercodehub.com/) 2 | 3 | GraphRepo is a tool for mining software repositories in real time. 
It indexes Git repositories in Neo4j and implements multiple queries to select and process the repository data. 4 | 5 | For a complete description, see the [online documentation](https://graphrepo.readthedocs.io/en/latest/). 6 | 7 | 8 |

9 | [image: GraphRepo architecture and data structure diagrams (docs/source/GraphRepoArch.svg, docs/source/GraphRepoDS.svg)]
10 |
11 |
12 | ### 1. Installation & First run
13 |
14 | #### 1.1 Prerequisites
15 | The only requirement is to have Python >=3.5 and Docker installed on your system.
16 |
17 | #### 1.2 Install using pip
18 |
19 | The production release can be installed using pip:
20 |
21 | ```
22 | $ pip install graphrepo
23 | ```
24 |
36 |
37 |
38 | #### 1.3 Run and configure Neo4j
39 |
40 | The following instructions assume the Docker daemon is running on your machine:
41 |
42 | ```
43 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11
44 | ```
45 |
46 | Open a browser window and go to [http://localhost:7474](http://localhost:7474). Here you can configure the Neo4j password.
47 | The default one is *neo4jj*.
48 |
49 | ##### Optionally, configure Neo4j to allow a larger heap size by adding the following flags to the command above:
50 |
51 | ```
52 | --env NEO4J_dbms_memory_pagecache_size=4g
53 | --env NEO4J_dbms_memory_heap_max__size=4g
54 | ```
55 |
56 | #### 1.4. Index and visualize a repo
57 |
58 | In order to index a repository, you must clone it to your local machine and point GraphRepo to it. For example:
59 | ```
60 | $ mkdir repos
61 | $ cd repos
62 | $ git clone https://github.com/ishepard/pydriller
63 | ```
64 |
65 | Now enter the [examples](/examples) folder from this repository, and edit the configuration file for PyDriller to reflect the database URL and desired batch size:
66 | ```
67 | $ cd ../examples/
68 | $ nano configs/pydriller.yml
69 | ```
70 |
71 | Afterwards, run the script from the examples folder, which indexes the repository in Neo4j:
72 |
73 | ```
74 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
75 | ```
76 |
77 | Go to [http://localhost:7474](http://localhost:7474) and use the query from section 3.1 below.
78 |
79 |
80 | #### 1.5. Retrieve all data from Neo4j using GraphRepo
81 |
82 | Assuming you succeeded in step 1.4, use the following command to retrieve all indexed data:
83 |
84 | ```
85 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
86 | ```
87 |
88 |
89 | ### 2. Examples
90 |
91 | For a comprehensive introduction and more examples, see the [documentation](https://graphrepo.readthedocs.io/en/latest/examples.html).
92 |
93 |
94 |
95 | ### 3. Useful Neo4j queries for the web interface
96 |
97 | #### 3.1 Match all nodes in a graph
98 | ```
99 | MATCH (n) RETURN n
100 | ```
101 |
102 |
103 | #### 3.2 Delete all nodes and relationships in a graph
104 |
105 | ```
106 | MATCH (n) DETACH DELETE n;
107 | ```
108 |
109 | #### 3.3 Delete a limited number of commits and their relationships
110 |
111 | ```
112 | MATCH (n:Commit)
113 | // Take the first 100 commit nodes and their rels
114 | WITH n LIMIT 100
115 | DETACH DELETE n
116 | RETURN count(*);
117 | ```
118 |
119 |
120 |
121 | This project is enabled by [PyDriller](https://github.com/ishepard/pydriller).
122 | -------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/GraphRepoDS.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[diagram: GraphRepo data structure. Nodes: Developer, File, Method, Branch, Commit. Relationships: Author, Parent, BranchCommit, UpdateFile, UpdateMethod.]
-------------------------------------------------------------------------------- /docs/source/GraphRepoSchema.svg: --------------------------------------------------------------------------------
1 | [diagram: GraphRepo schema, rendered with Neo4j. Nodes: Branch, Commit, Developer, File, Method. Relationships: Parent, BranchCommit, UpdateFile, Author, UpdateMethod, Method.]
-------------------------------------------------------------------------------- /docs/source/_templates/breadcrumbs.html: --------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 |
3 | {% block breadcrumbs_aside %}
4 | {% endblock %} -------------------------------------------------------------------------------- /docs/source/architecture.rst: --------------------------------------------------------------------------------
1 | .. _architecture_toplevel:
2 |
3 | ==================
4 | Architecture
5 | ==================
6 |
7 | GraphRepo consists of 3 main components:
8 |
9 | * :ref:`DRILLERS` - components used to parse data from a Git repository and insert records in Neo4j,
10 | * :ref:`MINERS` and the MineManager - components which hold default queries and interfaces for retrieving data from Neo4j, and
11 | * :ref:`MAPPERS` - components used to transform the data retrieved by Miners into a specific format, and to filter or sort it.
12 |
13 | The advantage of using custom mappers is that the load on Neo4j can be decreased:
14 | lighter queries extract the raw data, while the more intensive processing happens in the
15 | custom mappers. For example, one can write a mapper using PySpark on raw data extracted
16 | from Neo4j and use the Apache Spark engine for scalability (a minimal mapper is sketched at the end of this page).
17 |
18 | .. image:: /GraphRepoArch.svg
19 | :width: 400
20 | :align: center
21 |
22 |
23 | Specific information about each component can be found using the links above.
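A minimal mapper sketch (an illustration only, not part of the GraphRepo API): a plain
Python function that turns rows returned by a miner into a sorted pandas DataFrame.
The ``rows`` argument is assumed to be a list of dictionaries whose keys follow the
``UpdateFile`` attributes from the :ref:`DS` page::

    import pandas as pd

    def complexity_over_time(rows):
        """Map raw miner rows to a DataFrame sorted by commit time."""
        df = pd.DataFrame(rows)                # one row per UpdateFile record
        df = df[df['complexity'].notna()]      # keep rows that carry complexity
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
        return df.sort_values('date')[['date', 'complexity']]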
-------------------------------------------------------------------------------- /docs/source/conf.py: --------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 | import os
18 | import sys
19 | sys.path.insert(0, os.path.abspath('../'))
20 |
21 |
22 | # -- Project information -----------------------------------------------------
23 |
24 | project = 'GraphRepo'
25 | copyright = '2021, GraphRepo'
26 | author = 'GraphRepo'
27 |
28 | # The full version, including alpha/beta/rc tags
29 | version = ''
30 | release = '1.0.0'
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 |
35 | master_doc = 'index'
36 |
37 | # Add any Sphinx extension module names here, as strings. They can be
38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
39 | # ones.
40 | extensions = ['sphinx.ext.autodoc',
41 | 'sphinx.ext.doctest']
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # List of patterns, relative to source directory, that match files and
47 | # directories to ignore when looking for source files.
48 | # This pattern also affects html_static_path and html_extra_path.
49 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
50 |
51 |
52 | # -- Options for HTML output -------------------------------------------------
53 |
54 | # The theme to use for HTML and HTML Help pages. See the documentation for
55 | # a list of builtin themes.
56 | #
57 | html_theme = 'sphinx_rtd_theme'
58 |
59 | # Add any paths that contain custom static files (such as style sheets) here,
60 | # relative to this directory. They are copied after the builtin static files,
61 | # so a file named "default.css" will overwrite the builtin "default.css".
62 | html_static_path = ['_static']
63 |
64 |
65 | html_css_files = [
66 | 'css/custom.css',
67 | ] -------------------------------------------------------------------------------- /docs/source/configuration.rst: --------------------------------------------------------------------------------
1 | .. _CONFIGURATION:
2 |
3 | ==================
4 | Configuration
5 | ==================
6 |
7 | For any activity, GraphRepo uses a YAML (.yml) configuration file with two objects:
8 |
9 | * a Neo4j instance configuration, and
10 | * a repository configuration,
11 |
12 | as follows::
13 |
14 | neo:
15 | db_url: localhost # the url for the Neo4j database
16 | port: 7687 # the Neo4j port
17 | db_user: neo4j # Neo4j authentication username
18 | db_pwd: neo4jj # Neo4j authentication password
19 | batch_size: 100 # the batch size for inserting the records in Neo4j - this setting depends on the Neo4j resources
20 |
21 | project:
22 | repo: "repos/graphrepo/" # the repository filepath
23 | start_date: "1 February, 2018" # the start date for indexing (leave empty to index from the project's first commit)
24 | end_date: "30 March, 2018" # the end date for indexing (leave empty to index up to the last commit)
25 | project_id: "graphrepo" # a unique project id for the database
26 | index_code: False # boolean, if True GraphRepo indexes for each file touched by a commit the source code before and after the commit. This parameter significantly increases the index time and the hardware resources needed for Neo4j. For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
27 | index_developer_email: True # boolean, if True, GraphRepo indexes the developer emails in the Developer node. Turn this flag off for GDPR or any other privacy concerns
28 |
29 |
30 |
31 | Neo4j configuration
32 | ====================
33 |
34 | GraphRepo connects to Neo4j using the Bolt API from `py2neo <https://py2neo.org/>`_.
35 | Currently, the only attributes needed to connect to Neo4j are the URL and port, plus the authentication credentials.
36 | All other configurations (e.g., setting the user permissions) are done on the database side.
37 |
38 |
39 | Repository configuration
40 | ========================
41 |
42 | In order to insert a repository in the database, it has to be cloned on the local machine (where GraphRepo will run).
43 | Afterwards, it can be linked with GraphRepo using the ``project.repo`` attribute in the config file.
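Before handing the file to GraphRepo, it can be sanity-checked with PyYAML. A
minimal sketch (the path below points at the sample config shipped in the
project's ``examples`` folder)::

    import yaml

    with open('examples/configs/pydriller.yml') as fh:
        cfg = yaml.safe_load(fh)

    # the two required top-level objects
    assert {'neo', 'project'} <= cfg.keys()
    assert 'repo' in cfg['project'], 'project.repo must point to a local clone'
    print(cfg['neo']['db_url'], cfg['neo']['port'])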
44 |
45 | If one does not want to use all the repository data (e.g., if the repository is very large), one can configure
46 | the index dates using the ``project.start_date`` and ``project.end_date`` attributes.
47 |
48 | The ``project.project_id`` attribute is used to give each project a unique identifier.
49 | Currently, GraphRepo indexes all repositories in the same database, so that information about teams of developers that work
50 | on distinct projects can be mined without merging databases.
51 |
52 |
53 | The ``project.index_code`` attribute decides if GraphRepo indexes, for each file touched by a commit, the source code before and after the commit.
54 | This parameter significantly increases the index time and the hardware resources needed for Neo4j.
55 | For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
56 |
57 |
58 | For examples of config files, see the project's repository, e.g., ``examples/configs/pydriller.yml``.
59 |
60 |
61 |
62 |
63 |
64 |
65 |
-------------------------------------------------------------------------------- /docs/source/css/custom.css: --------------------------------------------------------------------------------
1 | /* Hide "On GitHub" section from versions menu */
2 | div.rst-versions>div.rst-other-versions>div.injected>dl:nth-child(4) {
3 | display: none;
4 | }
5 |
6 | .wy-breadcrumbs-aside {
7 | display: none;
8 | } -------------------------------------------------------------------------------- /docs/source/data_structure.rst: --------------------------------------------------------------------------------
1 | .. _DS:
2 |
3 | ==================
4 | Schema
5 | ==================
6 |
7 | The resulting Neo4j schema consists of 5 node types and 6 relationship types, as illustrated below:
8 |
9 | .. figure:: /GraphRepoSchema.svg
10 | :width: 45 %
11 | :align: center
12 |
13 | Nodes
14 | ===========
15 |
16 |
17 | Branch
18 | -----------
19 |
20 | Each branch identified by PyDriller is indexed as a node with the following attributes::
21 |
22 | {
23 | "hash": "string - unique identifier",
24 | "project_id": "string - project id from config (can be used to select all branches from a project)",
25 | "name": "string - branch name",
26 | }
27 |
28 | Commit
29 | -----------
30 |
31 | Each commit is indexed as a node with the following attributes::
32 |
33 | {
34 | "hash": "string - unique identifier in Neo4j",
35 | "commit_hash": "string - commit hash in git",
36 | "message": "string - commit message in git",
37 | "is_merge": "int - 1 if the commit is a merge, 0 otherwise",
38 | "timestamp": "int - Unix epoch, time of the commit",
39 | "project_id": "string - project id from config (can be used to select all commits from a project)",
40 | "dmm_unit_complexity": "int, see PyDriller",
41 | "dmm_unit_interfacing": "int, see PyDriller",
42 | "dmm_unit_size": "int, see PyDriller"
43 | }
44 |
45 |
46 |
47 | Developer
48 | -----------
49 |
50 | Each developer is indexed as a node with the following attributes::
51 |
52 | {
53 | "hash": "string - unique identifier",
54 | "name": "string - developer name as in git",
55 | "email": "string - developer email as in git",
56 | }
57 |
58 | Currently, the name and email information is not anonymized; see the ``index_developer_email`` flag in :ref:`CONFIGURATION`.
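To spot-check what is stored for a developer, the node can be queried directly.
A sketch using `py2neo <https://py2neo.org/>`_ (which GraphRepo uses internally);
the connection details follow the Docker command from :ref:`INSTALLATION`::

    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jj"))
    devs = graph.run(
        "MATCH (d:Developer) RETURN d.name AS name, d.email AS email LIMIT 5"
    ).data()
    print(devs)  # a list of dicts, e.g. [{'name': ..., 'email': ...}]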
59 |
60 | File
61 | -----------
62 |
63 |
64 | Each file is indexed as a node with the following attributes::
65 |
66 | {
67 | "hash": "string - unique identifier",
68 | "name": "string - file short name as in git",
69 | "project_id": "string - project id from config (can be used to select all files from a project)",
70 | "type": "string - file extension, e.g., '.py'"
71 | }
72 |
73 |
74 |
75 | Method
76 | -----------
77 |
78 | Each method is indexed as a node with the following attributes::
79 |
80 | {
81 | "hash": "string - unique identifier",
82 | "name": "string - method name as in file",
83 | "file_name": "string - parent file name",
84 | "project_id": "string - project id from config (can be used to select all methods from a project)",
85 | "type": "string - file extension, e.g., '.py'"
86 | }
87 |
88 |
89 |
90 | Relationships
91 | ===============
92 |
93 | Author
94 | -----------
95 |
96 | An Author relationship exists between each commit and its author.
97 | The direction is from Commit to Author and the relationship attributes are::
98 |
99 | {
100 | "timestamp": "int - Unix epoch, time of the commit"
101 | }
102 |
103 |
104 | BranchCommit
105 | --------------
106 | A BranchCommit relationship exists between each branch and its commits.
107 | The direction is from Branch to Commit. This relationship does not have any special attributes.
108 |
109 |
110 | Method
111 | -----------
112 |
113 | A Method relationship exists between each file and its methods.
114 | The direction is from File to Method. This relationship does not have any special attributes.
115 | To find out whether a method is still part of the file or was deleted, use the FileMiner.
116 |
117 |
118 | Parent
119 | -----------
120 | A Parent relationship exists between each commit and its parent(s).
121 | This relationship does not have any special attributes.
122 |
123 |
124 | UpdateFile
125 | -----------
126 |
127 | An UpdateFile relationship exists between a commit that edited a file and the edited file.
128 | The direction is from Commit to File and the relationship attributes are::
129 |
130 | {
131 | "timestamp": "int - Unix epoch, time of the commit",
132 | "old_path": "string - old path, if the file was moved (see type attribute)",
133 | "path": "string - current file path",
134 | "diff": "string - commit diff",
135 | "source_code": "string - source code after the commit",
136 | "source_code_before": "string - source code before the commit",
137 | "nloc": "int - file lines of code after the commit",
138 | "complexity": "int - file complexity after the commit",
139 | "token_count": "int - number of tokens after the commit",
140 | "added": "int - number of lines added in commit",
141 | "removed": "int - number of lines removed in commit",
142 | "type": "string - type of update. Possible values are: 'ADD', 'COPY', 'RENAME', 'DELETE', 'MODIFY', 'UNKNOWN' "
143 | }
144 |
145 |
146 | UpdateMethod
147 | -------------
148 |
149 | An UpdateMethod relationship exists between a commit that edited a method and the edited method.
150 | The direction is from Commit to Method and the relationship attributes are::
151 |
152 | {
153 | "timestamp": "int - Unix epoch, time of the commit",
154 | "long_name": "string - method long name, including parameters",
155 | "parameters": "string - method parameters",
156 | "complexity": "int - method complexity, after commit",
157 | "nloc": "int - method lines of code, after commit",
158 | "fan_in": "int - method fan in, after commit",
159 | "fan_out": "int - method fan out, after commit",
160 | "general_fan_out": "int - method general fan out, after commit",
161 | "length": "int - method length, after commit",
162 | "token_count": "int - method number of tokens, after commit",
163 | "start_line": "int - method start line, after commit",
164 | "end_line": "int - method end line, after commit",
165 | }
166 |
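As an illustration of how this schema is queried, the complexity history of a
file can be read directly from its ``UpdateFile`` relationships. A sketch using
`py2neo <https://py2neo.org/>`_; the file name below is hypothetical::

    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jj"))
    history = graph.run(
        "MATCH (:Commit)-[u:UpdateFile]->(f:File {name: 'driller.py'}) "
        "RETURN u.timestamp AS ts, u.complexity AS complexity ORDER BY ts"
    ).data()  # one record per commit that touched the file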
-------------------------------------------------------------------------------- /docs/source/driller.rst: --------------------------------------------------------------------------------
1 | .. _DRILLERS:
2 |
3 | ==================
4 | Drillers
5 | ==================
6 |
7 | All Drillers parse a repository and insert it in Neo4j.
8 | Under the hood, all drillers use PyDriller to extract data from a repository.
9 |
10 | Drillers perform the following activities.
11 | Given a config file, they:
12 |
13 | * establish a connection to Neo4j (or raise an exception if the connection fails),
14 | * parse the data from PyDriller,
15 | * insert the data in Neo4j.
16 |
17 |
18 | Currently there are 3 drillers available:
19 |
20 | * Driller - the default driller, which keeps the data parsed from the repository in RAM.
21 | * CacheDriller - stores the data parsed from the repository on disk, saving RAM at the cost of more disk writes and decreased performance (a usage sketch is shown at the end of this page).
22 | * QueueDriller - sends the data parsed from a repository to a queue. Currently it supports RabbitMQ and Artemis. Note that, with a queue, two drillers must be used: (i) one that parses the data from Git repos and (ii) one that indexes the data in Neo4j.
23 | The queue driller is the most scalable one, since it allows multiple indexing instances and thus works around some scalability issues (e.g., PyDriller is single-threaded).
24 |
25 | In order to index the data, you will need a config file (see :ref:`CONFIGURATION`) and the
26 | following code::
27 |
28 | from graphrepo.drillers.driller import Driller
29 |
30 | # configure driller
31 | driller = Driller(config_path='path-to-yaml-config-file.yml')
32 |
33 | # initialize the database indexes (only needed once per database)
34 | try:
35 | driller.init_db()
36 | except Exception as exc:
37 | print("DB already initialized")
38 |
39 | # drill (extract data and store it in Neo4j)
40 | driller.drill_batch()
41 |
42 | # merge duplicate nodes
43 | driller.merge_all()
44 |
45 |
46 | For a complete example, see :ref:`EXAMPLES`.
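If memory is a concern, ``CacheDriller`` is meant as a drop-in replacement for
``Driller``. A sketch, assuming it mirrors the interface above (check
``graphrepo/drillers/cache_driller.py`` for the exact signatures)::

    from graphrepo.drillers.cache_driller import CacheDriller

    driller = CacheDriller(config_path='path-to-yaml-config-file.yml')
    driller.drill_batch()  # parsed data is cached on disk instead of RAM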
-------------------------------------------------------------------------------- /docs/source/examples.rst: --------------------------------------------------------------------------------
1 | .. _EXAMPLES:
2 |
3 | ==================
4 | Examples
5 | ==================
6 |
7 | In the project's repository there are many examples on how to
8 | use GraphRepo to index and mine data.
9 |
10 | Please note that in order to run the plotting examples you have to install ``pandas`` and ``plotly``, for example using pip::
11 |
12 | $ pip install pandas plotly
13 |
14 | 1. Index data
15 | ==============
16 |
17 | In this example, we index all data from PyDriller in Neo4j.
18 | The example assumes you are running a Neo4j instance in Docker, as indicated in :ref:`CONFIGURATION`.
19 |
20 | In order to run the example, clone the projects using the following commands::
21 |
22 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo
23 | $ cd graphrepo
24 | $ mkdir repos
25 | $ cd repos
26 | $ git clone https://github.com/ishepard/pydriller
27 |
28 | In this step we cloned the GraphRepo project, which includes the example scripts to run,
29 | and the PyDriller project, which we want to experiment with.
30 |
31 | In order to run the indexing example, make sure to configure the config file in ``examples/configs/pydriller.yml``
32 | and set the ``neo`` object to your database settings.
33 |
34 | Then run::
35 |
36 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
37 |
38 | After indexing finishes, you can go to ``http://localhost:7474/browser/``
39 | and explore the project, with a query like: ``MATCH (n) RETURN n``.
40 |
41 |
42 | 2. Retrieve all data
43 | =====================
44 |
45 | This step assumes you already indexed the PyDriller repository
46 | in Neo4j, as indicated in Step 1.
47 | In order to retrieve all information for PyDriller, we can run
48 | the following example::
49 |
50 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
51 |
52 | This script will print the number of nodes indexed in the database.
53 |
54 |
55 | 3. Plot file complexity over time
56 | ===================================
57 |
58 | This step assumes you already indexed the PyDriller repository
59 | in Neo4j, as indicated in Step 1.
60 | In this example we will use the miners to retrieve a file and
61 | plot its complexity evolution over time.
62 | The file used is ``examples/file_complexity.py``.
63 | The complexity is stored in the ``UpdateFile`` relationship (see Schema).
64 | The ``get_change_history`` method from the ``File`` miner retrieves all the ``UpdateFile``
65 | relationships that point to the file.
66 |
67 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
68 | although any other libraries can be used.
69 |
70 | In order to display the plot, run::
71 |
72 | $ python -m examples.file_complexity --config=examples/configs/pydriller.yml
73 |
74 |
75 |
76 |
77 | 4. Plot file methods complexity over time
78 | ==========================================
79 |
80 | This step assumes you already indexed the PyDriller repository
81 | in Neo4j, as indicated in Step 1.
82 | In this example we will use the miners to retrieve and plot the complexity
83 | evolution over time of all methods in a file.
84 | The file used is ``examples/all_method_complexity.py``.
85 | The complexity is stored in the ``UpdateMethod`` relationship (see Schema).
86 | We first get all the methods for a file, then, for each method, we get the
87 | update information as in Step 3.
88 |
89 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
90 | although any other libraries can be used.
91 |
92 | In order to display the plot, run::
93 |
94 | $ python -m examples.all_method_complexity --config=examples/configs/pydriller.yml --plot
95 |
96 |
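5. Count indexed nodes
======================

A compact variant of the mine-all example: the miners initialized by the
``MineManager`` (see :ref:`MINERS`) can be combined to print how many nodes of
each type were indexed. A sketch, assuming ``get_all`` returns one record per
node::

    from graphrepo.miners import MineManager

    miner = MineManager(config_path='examples/configs/pydriller.yml')
    for name, m in [('commits', miner.commit_miner),
                    ('developers', miner.dev_miner),
                    ('files', miner.file_miner),
                    ('methods', miner.method_miner)]:
        print(name, len(m.get_all()))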
GraphRepo documentation master file, created by 2 | sphinx-quickstart on Wed Jun 3 13:16:41 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | GraphRepo documentation 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | installation 13 | configuration 14 | architecture 15 | data_structure 16 | driller 17 | miners 18 | examples 19 | 20 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. _INSTALLATION: 2 | 3 | ======================== 4 | Overview & Installation 5 | ======================== 6 | 7 | GraphRepo is a tool that indexes Git repositories in Neo4j and allows you to query and aggregate the data. 8 | Under the hood it uses `PyDriller <https://github.com/ishepard/pydriller>`_ to parse the data from a repository. 9 | 10 | Requirements 11 | ============ 12 | 13 | * Python 3.4 (or newer) 14 | * Neo4j 3 15 | * Docker (optional) - we recommend running Neo4j in Docker (as indicated below) 16 | 17 | Installation - using pip 18 | ========================= 19 | 20 | Assuming Python and pip are installed, use: 21 | 22 | .. sourcecode:: none 23 | 24 | $ pip install graphrepo 25 | 26 | 27 | Installation - clone source code (dev version) 28 | =============================================== 29 | 30 | The latest development version can be cloned from GitHub:: 31 | 32 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo 33 | $ cd graphrepo 34 | 35 | 36 | Install the requirements: 37 | 38 | .. sourcecode:: none 39 | 40 | $ pip install -r requirements.txt 41 | 42 | Run a docker instance with Neo4j:: 43 | 44 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11 45 | 46 | Run the tests:: 47 | 48 | $ pytest 49 | 50 | 51 | Or see the :ref:`EXAMPLES`. -------------------------------------------------------------------------------- /docs/source/mappers.rst: -------------------------------------------------------------------------------- 1 | .. _MAPPERS: 2 | 3 | ================== 4 | Mappers 5 | ================== 6 | 7 | -------------------------------------------------------------------------------- /docs/source/miners.rst: -------------------------------------------------------------------------------- 1 | .. _MINERS: 2 | 3 | ================== 4 | Miners 5 | ================== 6 | 7 | Miners are special classes which hold default Neo4j queries that can be used to extract data. 8 | At the moment, there are 4 standard miners, specific to the most important node entities in the graph: 9 | 10 | * ``CommitMiner`` - default queries for commits (including relationships to other nodes), 11 | * ``DeveloperMiner`` - default queries for developers (including relationships to other nodes), 12 | * ``FileMiner`` - default queries for files (including relationships to other nodes), 13 | * ``MethodMiner`` - default queries for methods (including relationships to other nodes), 14 | 15 | and a ``MineManager``, which initializes and configures all miners. 16 | 17 | We recommend always using the ``MineManager`` for initialization, since it adds no overhead compared to initializing a single miner. 
18 | Using a config file (see :ref:`CONFIGURATION`), the ``MineManager`` can be initialized as follows:: 19 | 20 | from graphrepo.miners import MineManager 21 | 22 | # initialize mine manager 23 | miner = MineManager(config_path='path-to-yaml-config-file.yml') 24 | 25 | # The specific miners can now be accessed as: 26 | miner.commit_miner.get_all() 27 | 28 | miner.dev_miner.get_all() 29 | 30 | miner.file_miner.get_all() 31 | 32 | miner.method_miner.get_all() -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/examples/__init__.py -------------------------------------------------------------------------------- /examples/all_method_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module plots the method complexity evolution over time, for a file""" 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import pandas as pd 21 | import plotly.express as px 22 | 23 | from graphrepo.miners import MineManager 24 | 25 | from datetime import datetime 26 | 27 | 28 | def parse_args(): 29 | """Parse args""" 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | '--config', default='examples/configs/pydriller.yml', type=str) 33 | parser.add_argument('--plot', default=False, type=bool) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | """Main""" 39 | args = parse_args() 40 | 41 | file_query = { 42 | 'hash': 'e2eb7bf414cebe68f46fa88e4abe9ae5813e91c4e1e97570f8e41cf4'} 43 | 44 | start = datetime.now() 45 | mine_manager = MineManager(config_path=args.config) 46 | 47 | methods = mine_manager.file_miner.get_current_methods(file_query['hash']) 48 | 49 | m_changes = [] 50 | for m in methods: 51 | changes = mine_manager.method_miner.get_change_history(m['hash']) 52 | mc = [{'complexity': x['complexity'], 53 | 'date': datetime.fromtimestamp(x['timestamp']), 54 | 'name': m['name']} for x in changes] 55 | m_changes = m_changes + mc 56 | print('All methods complexity took: {}'.format(datetime.now() - start)) 57 | print('Total methods: ', len(methods)) 58 | 59 | if args.plot: 60 | df = pd.DataFrame(m_changes) 61 | df['date'] = pd.to_datetime(df.date) 62 | df = df.sort_values(by='date') 63 | fig = px.line(df, x="date", y="complexity", color="name", 64 | line_group="name", hover_name="name") 65 | fig.show() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /examples/benchmarks/all_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 
(the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import yaml 18 | from graphrepo.miners import MineManager 19 | from datetime import datetime 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | 31 | start = datetime.now() 32 | miner = MineManager(config_path=args.config) 33 | 34 | # get all nodes and relationships from the manager 35 | nodes, rels = miner.get_all_data(map=False, merge=False) 36 | print("The DB has a total of {} nodes and {} relationships".format( 37 | len(nodes), len(rels))) 38 | print("All data took: {}".format(datetime.now() - start)) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /examples/benchmarks/all_methods_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from graphrepo.miners import MineManager 25 | from graphrepo.utils import parse_config 26 | 27 | from datetime import datetime 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 33 | parser.add_argument('--plot', default=False, type=bool) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | 40 | if 'jax' in args.config: 41 | file_query = { 42 | 'hash': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae'} 43 | if 'hadoop' in args.config: 44 | file_query = { 45 | 'hash': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806'} 46 | if 'kibana' in args.config: 47 | file_query = { 48 | 'hash': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48' 49 | } 50 | if 'tensorflow' in args.config: 51 | file_query = { 52 | 'hash': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73' 53 | } 54 | 55 | start = datetime.now() 56 | mine_manager = MineManager(config_path=args.config) 57 | methods = mine_manager.file_miner.get_current_methods(file_query['hash']) 58 | 59 | m_changes = [] 60 | for m in methods: 61 | changes = mine_manager.method_miner.get_change_history(m['hash']) 62 | mc = [{'complexity': x['complexity'], 63 | 'date': datetime.fromtimestamp(x['timestamp']), 64 | 'name': m['name']} for x in changes] 65 | m_changes = m_changes + mc 66 | 67 | print('All methods complexity took: {}'.format(datetime.now() - start)) 68 | print('Total methods: ', len(methods)) 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /examples/benchmarks/dev_files.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | return parser.parse_args() 33 | 34 | 35 | def main(): 36 | args = parse_args() 37 | 38 | if 'jax' in args.config: 39 | dev_query = { 40 | 'hash': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381'} 41 | if 'hadoop' in args.config: 42 | dev_query = { 43 | 'hash': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685'} 44 | if 'kibana' in args.config: 45 | dev_query = { 46 | 'hash': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750'} 47 | 48 | if 'tensorflow' in args.config: 49 | dev_query = { 50 | 'hash': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e'} 51 | 52 | start = datetime.now() 53 | mine_manager = MineManager(config_path=args.config) 54 | files = mine_manager.dev_miner.get_files( 55 | dev_query['hash'], 56 | mine_manager.config.ct.project_id 57 | ) 58 | ft = [f['type'] for f in files] 59 | grouped = [{'file': x, 'count': len( 60 | [y for y in ft if x == y])} for x in set(ft)] 61 | 62 | print('Dev file types took {}'.format(datetime.now() - start)) 63 | print('Nr files', len(ft)) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /examples/benchmarks/dev_methods.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | 21 | from datetime import datetime 22 | from graphrepo.miners import MineManager 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 28 | return parser.parse_args() 29 | 30 | 31 | def main(): 32 | args = parse_args() 33 | 34 | if 'jax' in args.config: 35 | dev_query = { 36 | 'hash': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381'} 37 | if 'hadoop' in args.config: 38 | dev_query = { 39 | 'hash': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685'} 40 | if 'kibana' in args.config: 41 | dev_query = { 42 | 'hash': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750'} 43 | if 'tensorflow' in args.config: 44 | dev_query = { 45 | 'hash': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e'} 46 | 47 | start = datetime.now() 48 | mine_manager = MineManager(config_path=args.config) 49 | method_updates = mine_manager.dev_miner.get_method_updates( 50 | dev_query['hash'], 51 | mine_manager.config.ct.project_id 52 | ) 53 | complexity = [c['complexity'] 54 | for c in method_updates if c['complexity'] != -1] 55 | _ = sum(complexity) / len(complexity) 56 | 57 | print('Dev method updates took {}'.format(datetime.now() - start)) 58 | print('Nr method updates', len(method_updates)) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /examples/benchmarks/file_nloc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | parser.add_argument('--plot', default=False, type=bool) 33 | return parser.parse_args() 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | 39 | if 'jax' in args.config: 40 | file_query = { 41 | 'hash': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae'} 42 | if 'hadoop' in args.config: 43 | file_query = { 44 | 'hash': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806'} 45 | 46 | if 'kibana' in args.config: 47 | file_query = { 48 | 'hash': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48' 49 | } 50 | if 'tensorflow' in args.config: 51 | file_query = { 52 | 'hash': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73' 53 | } 54 | 55 | start = datetime.now() 56 | 57 | mine_manager = MineManager(config_path=args.config) 58 | updated_file_rels = mine_manager.file_miner.get_change_history( 59 | file_hash=file_query['hash']) 60 | nloc = [x['nloc'] for x in updated_file_rels] 61 | 62 | print('File nloc took {}'.format(datetime.now() - start)) 63 | print('File changes', len(updated_file_rels)) 64 | # print(updated_file_rels.data) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /examples/configs/graphrepo.yml: -------------------------------------------------------------------------------- 1 | 2 | neo: 3 | db_url: localhost 4 | port: 7687 5 | db_user: neo4j 6 | db_pwd: neo4jj 7 | batch_size: 50 8 | 9 | project: 10 | repo: repos/GraphRepo/ 11 | start_date: #"1 February, 2018" 12 | end_date: #"30 March, 2018" 13 | project_id: 'graphrepo' 14 | index_code: False 15 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/grepo-test.yml: -------------------------------------------------------------------------------- 1 | 2 | neo: 3 | db_url: localhost 4 | port: 7687 5 | db_user: neo4j 6 | db_pwd: neo4jj 7 | batch_size: 50 8 | 9 | project: 10 | repo: repos/gr-testbench/ 11 | start_date: #"1 February, 2018" 12 | end_date: #"30 March, 2018" 13 | project_id: 'graphrepo-testbench' 14 | index_code: False 15 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/hadoop.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/hadoop/ 10 | start_date: "1 January, 2017 00:00" 11 | end_date: "1 January, 2018 00:00" 12 | project_id: hadoop 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/jax.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/jax/ 10 | start_date: "1 January, 2019 00:00" 11 | end_date: "1 May, 2020 00:00" 12 | 
project_id: jax 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/kibana.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/kibana/ 10 | start_date: "1 June, 2018 00:00" 11 | end_date: "1 June, 2019 00:00" 12 | project_id: kibana 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/pydriller.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/pydriller/ 10 | start_date: #"1 February, 2018" 11 | end_date: #"30 March, 2018" 12 | project_id: 'pydriller' 13 | index_code: False 14 | index_developer_email: True 15 | -------------------------------------------------------------------------------- /examples/configs/tensorflow.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/tensorflow/ 10 | start_date: "1 January, 2020 00:00" 11 | end_date: "1 March, 2020 00:00" 12 | project_id: 'tensorflow' 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/dev_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | '--config', default='examples/configs/pydriller.yml', type=str) 33 | return parser.parse_args() 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | mine_manager = MineManager(config_path=args.config) 39 | files = mine_manager.dev_miner.get_files( 40 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 41 | mine_manager.config.ct.project_id 42 | ) 43 | print(len(files), ' files') 44 | 45 | file_updates = mine_manager.dev_miner.get_files_updates( 46 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 47 | mine_manager.config.ct.project_id 48 | ) 49 | print(len(file_updates), ' file updates') 50 | 51 | methods = mine_manager.dev_miner.get_methods( 52 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 53 | mine_manager.config.ct.project_id 54 | ) 55 | print(len(methods), ' methods') 56 | 57 | method_updates = mine_manager.dev_miner.get_method_updates( 58 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 59 | mine_manager.config.ct.project_id 60 | ) 61 | print(len(method_updates), ' method updates') 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /examples/file_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
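# Illustrative invocation, as documented in docs/source/examples.rst
# (assumes the PyDriller repository was indexed first):
#   $ python -m examples.file_complexity --config=examples/configs/pydriller.yml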
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | return parser.parse_args() 33 | 34 | 35 | def main(): 36 | args = parse_args() 37 | mine_manager = MineManager(config_path=args.config) 38 | 39 | file_miner = mine_manager.file_miner 40 | file_ = file_miner.query(project_id=mine_manager.config.ct.project_id, 41 | name="commit.py") 42 | updated_file_rels = file_miner.get_change_history(file_['hash']) 43 | 44 | # sort update relationships and transform data for plotting 45 | updated_file_rels.sort(key=lambda x: x['timestamp']) 46 | 47 | complexity = [x['complexity'] for x in updated_file_rels] 48 | nloc = [x['nloc'] for x in updated_file_rels] 49 | dts = [datetime.fromtimestamp(x['timestamp']) for x in updated_file_rels] 50 | 51 | fig = px.line(pd.DataFrame({'date': dts, 'complexity': complexity}), 52 | x='date', y='complexity', 53 | title='Complexity over time for the commit.py file') 54 | fig.show() 55 | 56 | fig_2 = px.line(pd.DataFrame({'date': dts, 'nloc': nloc}), 57 | x='date', y='nloc', title="NLOC over time for the commit.py file") 58 | fig_2.show() 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /examples/index_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """This module is an example of indexing all data from a repository in Neo4j""" 15 | 16 | import argparse 17 | from graphrepo.drillers import Driller 18 | 19 | 20 | def parse_args(): 21 | """Parse argument""" 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | '--config', default='examples/configs/pydriller.yml', type=str) 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | """Main method""" 30 | args = parse_args() 31 | driller = Driller(config_path=args.config) 32 | # this method should be called only once, when initializing 33 | # a database for the first time 34 | try: 35 | driller.init_db() 36 | except Exception as exc: 37 | print("DB already initialized") 38 | driller.drill_batch() 39 | driller.merge_all() 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /examples/mine_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import yaml 18 | from graphrepo.miners import MineManager 19 | from datetime import datetime 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | '--config', default='examples/configs/pydriller.yml', type=str) 26 | return parser.parse_args() 27 | 28 | 29 | def main(): 30 | args = parse_args() 31 | 32 | start = datetime.now() 33 | miner = MineManager(config_path=args.config) 34 | 35 | # get all nodes and relationships from the manager 36 | nodes, rels = miner.get_all_data() 37 | print("The DB has a total of {} nodes and {} relationships".format( 38 | len(nodes), len(rels))) 39 | print("All data took: {}".format(datetime.now() - start)) 40 | 41 | # get all commits 42 | commits = miner.commit_miner.get_all() 43 | print("The DB has a total of {} commits".format(len(commits))) 44 | 45 | # get all developers 46 | devs = miner.dev_miner.get_all() 47 | print("The DB has a total of {} developers".format(len(devs))) 48 | 49 | # get all files 50 | files = miner.file_miner.get_all() 51 | print("The DB has a total of {} files".format(len(files))) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /graphrepo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /graphrepo/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """This module stores all config constants. It is a singleton 16 | because it is used across several modules inside the app""" 17 | 18 | from graphrepo.singleton import Singleton 19 | from graphrepo.utils import Dotdict 20 | 21 | 22 | class Config(metaclass=Singleton): 23 | """This class contains all config flags""" 24 | ct = {} 25 | 26 | def configure(self, **kwargs): 27 | """Stores configuration constants, parsed 28 | from the yaml config file 29 | :param kwargs: keys and values from config 30 | """ 31 | self.ct = Dotdict(kwargs) 32 | 33 | def check_config(self): 34 | """Checks if the config properties are set and 35 | raises ValueError if any value is missing""" 36 | 37 | if not self.ct.db_url or not self.ct.port \ 38 | or not self.ct.db_user or not self.ct.db_pwd: 39 | raise ValueError("Neo4j configuration is invalid.") 40 | -------------------------------------------------------------------------------- /graphrepo/drillers/__init__.py: -------------------------------------------------------------------------------- 1 | from .driller import * 2 | from .cache_driller import * 3 | from .stomp_driller import * 4 | from .queue_driller import * 5 | -------------------------------------------------------------------------------- /graphrepo/drillers/batch_utils.py: -------------------------------------------------------------------------------- 1 | """This module is the wild wild west of batch indexing :-) 2 | It contains all Neo4j queries for indexing the data in batches. 3 | More documentation will follow soon. 
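The helpers below share the same pattern: the payload is split into
chunks of batch_size and each chunk is sent to Neo4j as a single
UNWIND + MERGE query. A minimal illustrative call (the connection
values are assumptions that mirror the example configs):

    from py2neo import Graph
    graph = Graph(host='localhost', user='neo4j', password='neo4jj', port=7687)
    index_commits(graph, [{'hash': 'abc123', 'project_id': 'demo'}], batch_size=50)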
4 | """ 5 | from datetime import datetime 6 | 7 | 8 | def batch(iterable, n=1): 9 | l = len(iterable) 10 | for ndx in range(0, l, n): 11 | yield iterable[ndx:min(ndx + n, l)] 12 | 13 | 14 | def index_commits(graph, commits, batch_size=100): 15 | query = """ 16 | UNWIND {commits} AS c 17 | MERGE (nc :Commit { hash: c.hash}) 18 | ON CREATE SET 19 | nc = c 20 | ON MATCH SET 21 | nc = c 22 | """ 23 | for b in batch(commits, batch_size): 24 | graph.run(query, commits=b) 25 | 26 | 27 | def index_parent_commits(graph, parents, batch_size=100): 28 | query = """ 29 | UNWIND {ac} AS a 30 | MATCH (x:Commit),(y:Commit) 31 | WHERE x.hash = a.parent_hash AND y.hash = a.child_hash 32 | MERGE (x)-[r:Parent{}]->(y) 33 | """ 34 | for b in batch(parents, batch_size): 35 | graph.run(query, ac=b) 36 | 37 | 38 | def index_authors(graph, authors, batch_size=100): 39 | query = """ 40 | UNWIND {authors} AS a 41 | MERGE (nd:Developer { hash: a.hash}) 42 | ON CREATE SET nd = a 43 | ON MATCH SET nd = a 44 | """ 45 | for b in batch(authors, batch_size): 46 | graph.run(query, authors=b) 47 | 48 | 49 | def index_branches(graph, branches, batch_size=100): 50 | query = """ 51 | UNWIND {branches} AS a 52 | MERGE (nb:Branch { hash: a.hash}) 53 | ON CREATE SET nb = a 54 | ON MATCH SET nb = a 55 | """ 56 | for b in batch(branches, batch_size): 57 | graph.run(query, branches=b) 58 | 59 | 60 | def index_branch_commits(graph, bc, batch_size=100): 61 | query = """ 62 | UNWIND {ac} AS a 63 | MATCH (x:Branch),(y:Commit) 64 | WHERE x.hash = a.branch_hash AND y.hash = a.commit_hash 65 | MERGE (x)-[r:BranchCommit{}]->(y) 66 | """ 67 | for b in batch(bc, batch_size): 68 | graph.run(query, ac=b) 69 | 70 | 71 | def index_files(graph, files, batch_size=100): 72 | query = """ 73 | UNWIND {files} AS f 74 | MERGE (nf:File { hash: f.hash}) 75 | ON CREATE SET nf = f 76 | ON MATCH SET nf = f 77 | """ 78 | for b in batch(files, batch_size): 79 | graph.run(query, files=b) 80 | 81 | 82 | def index_methods(graph, methods, batch_size=100): 83 | query = """ 84 | UNWIND {methods} AS f 85 | MERGE (nm:Method { hash: f.hash}) 86 | ON CREATE SET nm = f 87 | ON MATCH SET nm = f 88 | """ 89 | 90 | for b in batch(methods, batch_size): 91 | graph.run(query, methods=b) 92 | 93 | 94 | def index_author_commits(graph, ac, batch_size=100): 95 | query = """ 96 | UNWIND {ac} AS a 97 | MATCH (x:Developer),(y:Commit) 98 | WHERE x.hash = a.author_hash AND y.hash = a.commit_hash 99 | MERGE (x)-[r:Author{timestamp: a.timestamp}]->(y) 100 | """ 101 | for b in batch(ac, batch_size): 102 | graph.run(query, ac=b) 103 | 104 | 105 | def index_commit_files(graph, cf, batch_size=100): 106 | query = """ 107 | UNWIND {cf} AS a 108 | MATCH (x:Commit),(y:File) 109 | WHERE x.hash = a.commit_hash AND y.hash = a.file_hash 110 | MERGE (x)-[r:UpdateFile{}]->(y) 111 | ON CREATE SET r=a['attributes'] 112 | """ 113 | for i, b in enumerate(batch(cf, batch_size)): 114 | graph.run(query, cf=b) 115 | 116 | 117 | def index_file_methods(graph, cf, batch_size=100): 118 | query = """ 119 | UNWIND {cf} AS a 120 | MATCH (x:File),(y:Method) 121 | WHERE x.hash = a.file_hash AND y.hash = a.method_hash 122 | MERGE (x)-[r:Method{}]->(y) 123 | """ 124 | for b in batch(cf, batch_size): 125 | graph.run(query, cf=b) 126 | 127 | 128 | def index_commit_method(graph, cm, batch_size=100): 129 | query = """ 130 | UNWIND {cf} AS a 131 | MATCH (x:Commit),(y:Method) 132 | WHERE x.hash = a.commit_hash AND y.hash = a.method_hash 133 | MERGE (x)-[r:UpdateMethod]->(y) 134 | ON CREATE SET r=a['attributes'] 135 | """ 136 | 
for i, b in enumerate(batch(cm, batch_size)): 137 | graph.run(query, cf=b) 138 | 139 | 140 | def create_index_authors(graph): 141 | query = """ 142 | CREATE INDEX ON :Developer(hash) 143 | """ 144 | graph.run(query) 145 | 146 | 147 | def create_index_commits(graph, hash=True): 148 | if hash: 149 | hash_q = """ 150 | CREATE INDEX ON :Commit(hash) 151 | """ 152 | graph.run(hash_q) 153 | 154 | pid_q = """ 155 | CREATE INDEX ON :Commit(project_id) 156 | """ 157 | 158 | graph.run(pid_q) 159 | 160 | 161 | def create_index_branches(graph, hash=True): 162 | if hash: 163 | hash_q = """ 164 | CREATE INDEX ON :Branch(hash) 165 | """ 166 | graph.run(hash_q) 167 | 168 | pid_q = """ 169 | CREATE INDEX ON :Branch(project_id) 170 | """ 171 | graph.run(pid_q) 172 | 173 | 174 | def create_index_files(graph, hash=True): 175 | if hash: 176 | hash_q = """ 177 | CREATE INDEX ON :File(hash) 178 | """ 179 | graph.run(hash_q) 180 | 181 | mhash_q = """ 182 | CREATE INDEX ON :File(merge_hash) 183 | """ 184 | graph.run(mhash_q) 185 | 186 | pid_q = """ 187 | CREATE INDEX ON :File(project_id) 188 | """ 189 | graph.run(pid_q) 190 | 191 | 192 | def create_index_methods(graph, hash=True): 193 | if hash: 194 | hash_q = """ 195 | CREATE INDEX ON :Method(hash) 196 | """ 197 | graph.run(hash_q) 198 | 199 | mhash_q = """ 200 | CREATE INDEX ON :Method(merge_hash) 201 | """ 202 | graph.run(mhash_q) 203 | 204 | pid_q = """ 205 | CREATE INDEX ON :Method(project_id) 206 | """ 207 | graph.run(pid_q) 208 | 209 | 210 | def merge_renamed_files(graph, project_id): 211 | query = """ 212 | MATCH (n1:File),(n2:File) 213 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" and n1.merge_hash = n2.merge_hash and id(n1) < id(n2) 214 | WITH [n1,n2] as ns 215 | order by id(ns[1]) desc 216 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 217 | MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf 218 | with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows 219 | unwind allRows as row 220 | match (mu: Method {{hash: row.hash}}) 221 | SET mu.merge_hash = row.new_hash""".format(project_id) 222 | graph.run(query) 223 | 224 | def merge_new_files(graph, project_id): 225 | query = """ 226 | MATCH (n1:File),(n2:File) 227 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" and n1.merge_hash = n2.hash and id(n1) < id(n2) 228 | WITH [n1,n2] as ns 229 | order by id(ns[1]) desc 230 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 231 | MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf 232 | with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows 233 | unwind allRows as row 234 | match (mu: Method {{hash: row.hash}}) 235 | SET mu.merge_hash = row.new_hash 236 | """.format(project_id) 237 | graph.run(query) 238 | 239 | 240 | def merge_methods(graph, project_id): 241 | query = """ 242 | MATCH (n1:Method),(n2:Method) 243 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" 244 | and n1.file_name = n2.file_name and n1.name = n2.name and n1.project_id = n2.project_id and n1.merge_hash = n2.merge_hash and id(n1) < id(n2) 245 | WITH [n1,n2] as ns 246 | order by id(ns[1]) desc 247 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 248 | return node 249 | """.format(project_id) 250 | graph.run(query) 251 | 252 | 253 | def merge_files(graph, config): 254 | print('Merging moved files and methods') 255 | start = datetime.now() 256 | merge_renamed_files(graph, config.project_id) 257 | 
merge_methods(graph, config.project_id) 258 | merge_new_files(graph, config.project_id) 259 | merge_methods(graph, config.project_id) 260 | print('Merged files and methods \t', datetime.now()-start) 261 | 262 | def index_all(graph, developers, commits, parents, dev_commits, branches, 263 | branches_commits, files, commit_files, methods, file_methods, 264 | commit_methods, config): 265 | 266 | total = datetime.now() 267 | 268 | batch_size = config.batch_size 269 | 270 | developers = list({v['hash']: v for v in developers}.values()) 271 | print('Indexing ', len(developers), ' authors') 272 | start = datetime.now() 273 | index_authors(graph, developers, batch_size) 274 | print('Indexed authors in: \t', datetime.now()-start) 275 | 276 | print('Indexing ', len(commits), ' commits') 277 | start = datetime.now() 278 | index_commits(graph, commits, batch_size) 279 | print('Indexed commits in: \t', datetime.now()-start) 280 | 281 | branches = list({v['hash']: v for v in branches}.values()) 282 | branches_commits = list({str(i): i for i in branches_commits}.values()) 283 | print('Indexing ', len(branches), ' branches') 284 | start = datetime.now() 285 | index_branches(graph, branches, batch_size) 286 | index_branch_commits(graph, branches_commits, batch_size) 287 | print('Indexed branches in: \t', datetime.now()-start) 288 | 289 | files = list({v['hash']: v for v in files}.values()) 290 | print('Indexing ', len(files), ' files') 291 | start = datetime.now() 292 | index_files(graph, files, batch_size) 293 | print('Indexed files in: \t', datetime.now()-start) 294 | 295 | methods = list({v['hash']: v for v in methods}.values()) 296 | print('Indexing ', len(methods), ' methods') 297 | start = datetime.now() 298 | index_methods(graph, methods, batch_size) 299 | print('Indexed methods in: \t', datetime.now()-start) 300 | 301 | parents = list({str(i): i for i in parents}.values()) 302 | print('Indexing ', len(parents), ' parent commits') 303 | start = datetime.now() 304 | index_parent_commits(graph, parents, batch_size) 305 | print('Indexed parent commits in: \t', datetime.now()-start) 306 | 307 | print('Indexing ', len(dev_commits), ' author_commits') 308 | start = datetime.now() 309 | index_author_commits(graph, dev_commits, batch_size) 310 | print('Indexed author_commits in: \t', datetime.now()-start) 311 | 312 | file_methods = list({str(i): i for i in file_methods}.values()) 313 | print('Indexing ', len(file_methods), ' file_methods') 314 | start = datetime.now() 315 | index_file_methods(graph, file_methods, batch_size) 316 | print('Indexed file_methods in: \t', datetime.now()-start) 317 | 318 | print('Indexing ', len(commit_methods), ' commit_methods') 319 | start = datetime.now() 320 | index_commit_method(graph, commit_methods, batch_size) 321 | print('Indexed commit_methods in: \t', datetime.now()-start) 322 | 323 | print('Indexing ', len(commit_files), ' commit_files') 324 | start = datetime.now() 325 | index_commit_files(graph, commit_files, batch_size) 326 | print('Indexed commit_files in: \t', datetime.now()-start) 327 | print('Indexing took: \t', datetime.now()-total) 328 | 329 | 330 | def index_cache(graph, cache, config): 331 | batch_size = config.batch_size 332 | total = datetime.now() 333 | index_authors(graph, list( 334 | {v['hash']: v for v in cache.data['developers']}.values()), batch_size) 335 | index_commits(graph, cache.data['commits'], batch_size) 336 | index_branches(graph, list( 337 | {v['hash']: v for v in cache.data['branches']}.values()), batch_size) 338 | index_branch_commits(graph, 
list( 339 | {str(i): i for i in cache.data['branches_commits']}.values()), batch_size) 340 | index_files(graph, list( 341 | {v['hash']: v for v in cache.data['files']}.values()), batch_size) 342 | index_methods(graph, list( 343 | {v['hash']: v for v in cache.data['methods']}.values()), batch_size) 344 | index_parent_commits(graph, list( 345 | {str(i): i for i in cache.data['parents']}.values()), batch_size) 346 | index_author_commits(graph, cache.data['dev_commits'], batch_size) 347 | index_file_methods(graph, list( 348 | {str(i): i for i in cache.data['file_methods']}.values()), batch_size) 349 | index_commit_method(graph, cache.data['commit_methods'], batch_size) 350 | index_commit_files(graph, cache.data['commit_files'], batch_size) 351 | print('Indexing took: \t', datetime.now()-total) 352 | -------------------------------------------------------------------------------- /graphrepo/drillers/cache_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ This module uses pydriller to search a repository 16 | and indexes it in neo4j 17 | """ 18 | from datetime import datetime 19 | from pydriller import RepositoryMining 20 | 21 | import graphrepo.utils as utl 22 | import graphrepo.drillers.batch_utils as b_utl 23 | from graphrepo.drillers.drill_cache import DrillCache, DrillCacheSequential 24 | from graphrepo.drillers.default import DefaultDriller 25 | from graphrepo.logger import Logger 26 | 27 | LG = Logger() 28 | 29 | 30 | class CacheDriller(DefaultDriller): 31 | """CacheDriller class - parses a git repo and uses the models 32 | to index everything in Neo4j by storing all data on disk. 
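    A minimal illustrative flow (the config path is an assumption):

        driller = CacheDriller(config_path='path-to-yaml-config-file.yml')
        cache = driller.drill_batch_cache_sequential(index=True)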
33 | """ 34 | 35 | def drill_batch_cache_sequential(self, index=True): 36 | """Extracts all information from a git repository 37 | and stores it in a disk cache 38 | :param index: optional, if True, the data is indexed in Neo4j 39 | :returns: cache with all data 40 | """ 41 | start = datetime.now() 42 | print('Driller started at: \t', start) 43 | cache = DrillCacheSequential() 44 | for commit in \ 45 | RepositoryMining(self.config.ct.repo, 46 | since=self.config.ct.start_date, 47 | to=self.config.ct.end_date).traverse_commits(): 48 | timestamp = commit.author_date.timestamp() 49 | dev = utl.format_dev(commit, self.config.ct.index_developer_email) 50 | cache.append_cache('developers', dev) 51 | com = utl.format_commit(commit, self.config.ct.project_id) 52 | cache.append_cache('commits', com) 53 | cache.append_cache( 54 | 'dev_commits', 55 | utl.format_author_commit(dev, com, timestamp)) 56 | for parent in commit.parents: 57 | cache.append_cache('parents', utl.format_parent_commit( 58 | com['hash'], parent, self.config.ct.project_id)) 59 | for branch in commit.branches: 60 | br_ = utl.format_branch(branch, self.config.ct.project_id) 61 | cache.append_cache('branches', br_) 62 | cache.append_cache('branches_commits', utl.format_branch_commit( 63 | br_['hash'], com['hash'])) 64 | for file in commit.modifications: 65 | fl_ = utl.format_file(file, self.config.ct.project_id) 66 | cache.append_cache('files', fl_) 67 | cache.append_cache('commit_files', utl.format_commit_file( 68 | com['hash'], file, timestamp, self.config.ct.project_id)) 69 | for method in file.changed_methods: 70 | met = utl.format_method( 71 | method, file, self.config.ct.project_id) 72 | cache.append_cache('methods', met) 73 | cache.append_cache( 74 | 'file_methods', 75 | utl.format_file_method(fl_['hash'], 76 | met['hash'])) 77 | cache.append_cache('commit_methods', 78 | utl.format_commit_method( 79 | com['hash'], 80 | met['hash'], 81 | method, 82 | timestamp)) 83 | print('Driller finished in: \t', datetime.now() - start) 84 | if index: 85 | self.index_batch(cache) 86 | return cache 87 | 88 | def index_batch(self, cache): 89 | """Indexes cached data to Neo4j 90 | :param cache: diskcache Cache or Index 91 | """ 92 | try: 93 | self.config.check_config() 94 | self._check_connection() 95 | b_utl.index_cache( 96 | self.graph, cache, config=self.config.ct) 97 | except Exception as exc: 98 | LG.log_and_raise(exc) 99 | else: 100 | return 101 | 102 | def drill_batch_cache_all(self, index=True): 103 | """Extracts the information from a repository in memory 104 | and caches it after the extraction 105 | :param index: optional, if True, the data is indexed in Neo4j 106 | """ 107 | data = self.drill_batch(index=False) 108 | cache = DrillCache(data) 109 | if index: 110 | self.index_batch(cache) 111 | return cache 112 | -------------------------------------------------------------------------------- /graphrepo/drillers/db_init.py: -------------------------------------------------------------------------------- 1 | """This module initializes the Neo4j indexes""" 2 | import graphrepo.drillers.batch_utils as utils 3 | 4 | 5 | def create_hash_constraints(graph): 6 | """Creates uniqueness constraints on nodes' hash""" 7 | query = """CREATE CONSTRAINT ON (n: {}) ASSERT n.hash IS UNIQUE""" 8 | nodes = ["Developer", "Branch", "Commit", "File", "Method"] 9 | for node in nodes: 10 | fquery = query.format(node) 11 | graph.run(fquery) 12 | 13 | 14 | def create_indices(graph, hash_index=True): 15 | """Initializes all indexes for the database""" 16 | if 
hash_index: 17 | utils.create_index_authors(graph) 18 | utils.create_index_branches(graph, hash_index) 19 | utils.create_index_commits(graph, hash_index) 20 | utils.create_index_files(graph, hash_index) 21 | utils.create_index_methods(graph, hash_index) 22 | -------------------------------------------------------------------------------- /graphrepo/drillers/default.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 GraphRepo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Default parent class for drillers 16 | """ 17 | from abc import abstractmethod 18 | from datetime import datetime 19 | from py2neo import Graph 20 | from pydriller import RepositoryMining 21 | 22 | import graphrepo.utils as utl 23 | import graphrepo.drillers.batch_utils as b_utl 24 | import graphrepo.drillers.db_init as db_init 25 | from graphrepo.config import Config 26 | from graphrepo.logger import Logger 27 | LG = Logger() 28 | 29 | 30 | class DefaultDriller(): 31 | """DefaultDriller class - parses a git repo and uses the models 32 | to index everything in Neo4j. 33 | """ 34 | 35 | def __init__(self, config_path): 36 | """Initializes the properties of this class 37 | :param config_path: path to yml config file 38 | """ 39 | try: 40 | if not config_path: 41 | raise FileNotFoundError 42 | neo, project = utl.parse_config(config_path) 43 | self.config = Config() 44 | self.graph = None 45 | self.config.configure(**neo, **project) 46 | self._connect() 47 | except Exception as exc: 48 | LG.log_and_raise(exc) 49 | 50 | def _connect(self): 51 | """Instantiates the connection to Neo4j and stores 52 | the graph internally. 53 | Raises an exception if the connection cannot be established 54 | """ 55 | try: 56 | self.graph = Graph(host=self.config.ct.db_url, 57 | user=self.config.ct.db_user, 58 | password=self.config.ct.db_pwd, 59 | port=self.config.ct.port) 60 | except Exception as exc: 61 | LG.log_and_raise(exc) 62 | 63 | def _check_connection(self): 64 | """Checks if there is a db connection and raises 65 | ReferenceError if not. 66 | """ 67 | try: 68 | self._connect() 69 | except: 70 | raise ReferenceError("There is no valid " 71 | "database connection. 
Please " 72 | "configure and connect first.") 73 | 74 | def init_db(self): 75 | """Runs the initialization of a database; creates 76 | constraints and indexes""" 77 | try: 78 | self._check_connection() 79 | db_init.create_hash_constraints(self.graph) 80 | db_init.create_indices(self.graph, hash_index=False) 81 | except Exception as exc: 82 | raise exc 83 | 84 | def clean(self): 85 | """Removes all data in a graph 86 | """ 87 | try: 88 | self.config.check_config() 89 | self._check_connection() 90 | 91 | self.graph.run("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r") 92 | except Exception as exc: 93 | LG.log_and_raise(exc) 94 | 95 | def drill_batch(self, index=True, save_path=None): 96 | """Extracts data from a software repository, with the option 97 | of saving it on disk and indexing it in Neo4j 98 | :param index: optional, if True, the data is indexed in Neo4j 99 | :param save_path: optional, if given, the data is stored on disk 100 | :returns: dictionary with all data 101 | """ 102 | start = datetime.now() 103 | print('Driller started at: \t', start) 104 | commits, parents, devs, dev_com, branches,\ 105 | branches_com, files, com_files, \ 106 | methods, files_methods, com_methods = \ 107 | [], [], [], [], [], [], [], [], [], [], [] 108 | for commit in \ 109 | RepositoryMining(self.config.ct.repo, 110 | since=self.config.ct.start_date, 111 | to=self.config.ct.end_date).traverse_commits(): 112 | self.drill_commit(commit, commits, parents, devs, dev_com, branches, 113 | branches_com, files, com_files, 114 | methods, files_methods, com_methods) 115 | 116 | data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches, 117 | branches_com, files, com_files, 118 | methods, files_methods, com_methods) 119 | 120 | print('Driller finished in: \t', datetime.now() - start) 121 | 122 | if save_path: 123 | utl.save_json(save_path, data_) 124 | if index: 125 | self.index_batch(**data_) 126 | return data_ 127 | 128 | def drill_commit(self, commit, commits, parents, devs, dev_com, branches, 129 | branches_com, files, com_files, 130 | methods, files_methods, com_methods): 131 | """Helper method - works with pass by reference""" 132 | timestamp = commit.author_date.timestamp() 133 | dev = utl.format_dev(commit, self.config.ct.index_developer_email) 134 | devs.append(dev) 135 | com = utl.format_commit(commit, self.config.ct.project_id) 136 | commits.append(com) 137 | dev_com.append(utl.format_author_commit(dev, com, timestamp)) 138 | for parent in commit.parents: 139 | parents.append(utl.format_parent_commit( 140 | com['hash'], parent, self.config.ct.project_id)) 141 | for branch in commit.branches: 142 | br_ = utl.format_branch(branch, self.config.ct.project_id) 143 | branches.append(br_) 144 | branches_com.append( 145 | utl.format_branch_commit(br_['hash'], com['hash'])) 146 | for file in commit.modifications: 147 | fl_ = utl.format_file(file, self.config.ct.project_id) 148 | files.append(fl_) 149 | com_files.append(utl.format_commit_file( 150 | com['hash'], file, 151 | timestamp, self.config.ct.project_id, self.config.ct.index_code)) 152 | for method in file.changed_methods: 153 | met = utl.format_method( 154 | method, file, self.config.ct.project_id) 155 | methods.append(met) 156 | files_methods.append( 157 | utl.format_file_method(fl_['hash'], met['hash']) 158 | ) 159 | com_methods.append( 160 | utl.format_commit_method(com['hash'], met['hash'], 161 | method, timestamp)) 162 | 163 | def data_dot_dict(self, commits, parents, devs, dev_com, branches, 164 | branches_com, files, com_files, 165 | 
methods, files_methods, com_methods): 166 | """Helper method""" 167 | return utl.Dotdict({'commits': commits, 168 | 'parents': parents, 169 | 'developers': devs, 170 | 'dev_commits': dev_com, 171 | 'branches': branches, 172 | 'branches_commits': branches_com, 173 | 'files': files, 174 | 'commit_files': com_files, 175 | 'methods': methods, 176 | 'file_methods': files_methods, 177 | 'commit_methods': com_methods}) 178 | 179 | @abstractmethod 180 | def index_batch(self): 181 | """Abstract index batch driller method 182 | """ 183 | raise NotImplementedError 184 | 185 | 186 | def merge_all(self): 187 | """Merges renamed files and methods""" 188 | try: 189 | b_utl.merge_files(self.graph, self.config.ct) 190 | except Exception as exc: 191 | LG.log_and_raise(exc) 192 | else: 193 | return 194 | -------------------------------------------------------------------------------- /graphrepo/drillers/delete_all.py: -------------------------------------------------------------------------------- 1 | # def delete_all(): 2 | # # get total #of nodes 3 | # res = session.run("MATCH(n) RETURN COUNT(*) AS n") 4 | # total_nodes = 0 5 | # for item in res: 6 | # total_nodes = item["n"] 7 | # print("\n Existing nodes in db:", total_nodes) 8 | 9 | # # get total #of relationships 10 | # res1 = session.run("MATCH (n)-[r]->() RETURN COUNT(r) as r") 11 | # total_rels = 0 12 | # for item in res1: 13 | # total_rels = item["r"] 14 | # print("\n Existing relationships in db:", total_rels) 15 | 16 | # # delete all nodes in batches (for faster deletion) 17 | # while total_nodes > 0: 18 | # res = session.run( 19 | # "MATCH(n) WITH n LIMIT 10000 DETACH DELETE n RETURN COUNT(n) AS count") 20 | # count = 0 21 | # for item in res: 22 | # count = item["count"] # updates deleted node count here 23 | # total_nodes = total_nodes-count 24 | # print("\n #of nodes in db after deletion completed = ", total_nodes) 25 | 26 | 27 | # start = time.time() 28 | # delete_all() 29 | # print("\n Pre cleanup time (sec): ", time.time()-start) 30 | 31 | # for prot in fileList: 32 | # print("\n\n", prot) 33 | # if os.path.exists(prot+"_AllCCs_maxDist11.csv"): 34 | # print("\n Already Processed.") 35 | # continue 36 | # start = time.time() 37 | # delete_all() 38 | # pre_time = time.time()-start 39 | # print("\n Pre cleanup time (sec): ", pre_time) 40 | 41 | # # Database preparation 42 | # session.run("CREATE INDEX ON :MyNode(Name)") 43 | 44 | # # 1. Create graph 45 | # start = time.time() 46 | # session.run("USING PERIODIC COMMIT " 47 | # "LOAD CSV FROM 'file:///'+{prot}+'_conflict_resolved.txt' AS line " 48 | # "MERGE (n:MyNode {Name:line[0]}) " 49 | # "MERGE (m:MyNode {Name:line[1]}) " 50 | # "MERGE (n) -[:TO {dist:line[2]}] -> (m) ", prot=prot) 51 | 52 | # end = time.time() 53 | # step1_time = end - start 54 | # print("\n Step 1 time (in sec) = ", end-start) 55 | 56 | # # 2 find CCs 57 | # start = time.time() 58 | # result = session.run("CALL algo.unionFind.stream('MyNode', 'TO', {graph:'huge'}) " 59 | # "YIELD nodeId,setId " 60 | # "MATCH (n) " 61 | # "WHERE id(n)=nodeId " 62 | # "WITH setId,collect(nodeId) as nodes, collect(n.Name) as labels,count(*) as size_of_component " 63 | # "ORDER BY size_of_component DESC " 64 | # "RETURN setId as componentId,size_of_component,labels as connectedTSRkeys ") 65 | # end = time.time() 66 | # step2_time = end - start 67 | # print("\n Step 2 time (in sec) = ", end-start) 68 | # # 3. 
save result 69 | # start = time.time() 70 | # # newline='' <- to avoid blank line between two rows 71 | # with open(prot+"_AllCCs_maxDist11.csv", "w") as csvfile: 72 | # writer = csv.writer(csvfile, delimiter=',') 73 | # writer.writerow( 74 | # ['componentId', 'size_of_component', 'connectedTSRkeys']) 75 | # for record in result: 76 | # record = str(record)[:-1].replace(", ", 77 | # ",").replace("'", "").split() 78 | # print("\n", record[1], record[2], record[3]) 79 | # writer.writerow([record[1].split("=")[1], record[2].split("=")[ 80 | # 1], record[3].split("=")[1]]) 81 | # end = time.time() 82 | # step3_time = end - start 83 | # print("\n Step 3 time (in sec) = ", end-start) 84 | 85 | # # 4. delete graph 86 | # start = time.time() 87 | # delete_all() 88 | # end = time.time() 89 | # post_time = end - start 90 | # print("\n Post cleanup time (in sec) = ", end-start) 91 | 92 | # print("\n Total time = ", pre_time+step1_time + 93 | # step2_time+step3_time+post_time) 94 | 95 | # driver.close() 96 | -------------------------------------------------------------------------------- /graphrepo/drillers/drill_cache.py: -------------------------------------------------------------------------------- 1 | """This module saves the cache data on disk""" 2 | import collections 3 | from diskcache import Index 4 | 5 | 6 | class DrillCache: 7 | """Class for storing all data at once in the cache""" 8 | 9 | def __init__(self, data): 10 | """Transforms a dictionary to an ordered dict and saves it""" 11 | dt_ = [(k, v) for k, v in data.items()] 12 | self.data = Index(collections.OrderedDict(dt_)) 13 | 14 | 15 | class DrillCacheSequential: 16 | """Class for a sequential disk cache, appended to record by record""" 17 | 18 | def __init__(self): 19 | """Init drill cache""" 20 | self.data = Index([('commits', []), 21 | ('parents', []), ('developers', []), 22 | ('dev_commits', []), ('branches', []), 23 | ('branches_commits', []), ('files', []), 24 | ('commit_files', []), ('methods', []), 25 | ('file_methods', []), ('commit_methods', []) 26 | ]) 27 | 28 | def append_cache(self, key, value): 29 | """Appends a record to an array in the disk cache 30 | :param key: data key 31 | :param value: value to append 32 | """ 33 | temp_ = self.data[key] 34 | temp_.append(value) 35 | self.data[key] = temp_ 36 | -------------------------------------------------------------------------------- /graphrepo/drillers/driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
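# NOTE (editor's illustration, not part of the repository): a minimal usage
# sketch for DrillCacheSequential above. It assumes diskcache is installed
# (see requirements.txt); the record dicts are placeholder shapes, similar to
# what graphrepo.utils produces.
from graphrepo.drillers.drill_cache import DrillCacheSequential

cache = DrillCacheSequential()
# each append reads the list from disk, appends the record and writes it back
cache.append_cache('commits', {'hash': 'abc123', 'message': 'example commit'})
cache.append_cache('developers', {'hash': 'dev1', 'name': 'Jane Doe'})
print(len(cache.data['commits']))  # -> 1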
14 | 15 | """ This module uses pydriller to search a repository 16 | and indexes it in Neo4j 17 | """ 18 | from diskcache import Cache 19 | from datetime import datetime 20 | from py2neo import Graph 21 | from pydriller import RepositoryMining 22 | 23 | import graphrepo.utils as utl 24 | import graphrepo.drillers.batch_utils as b_utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.drill_cache import DrillCacheSequential 27 | from graphrepo.drillers.default import DefaultDriller 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class Driller(DefaultDriller): 34 | """Drill class - parses a git repo and uses the models 35 | to index everything in Neo4j. This class is a singleton 36 | because it holds the connection to Neo4j in self.graph 37 | """ 38 | 39 | def index_batch(self, **kwargs): 40 | """Indexes data extracted by drill_batch or from 41 | disk in Neo4j 42 | :param kwargs: data keys and values (see the drill_batch return) 43 | """ 44 | try: 45 | self.config.check_config() 46 | self._check_connection() 47 | b_utl.index_all( 48 | self.graph, config=self.config.ct, **kwargs) 49 | except Exception as exc: 50 | LG.log_and_raise(exc) 51 | else: 52 | return 53 | 54 | def index_from_file(self, file_path): 55 | """Reads a file and indexes the data in Neo4j 56 | :param file_path: the path of the JSON file with data 57 | """ 58 | try: 59 | data_ = utl.load_json(file_path) 60 | self.index_batch(**data_) 61 | except Exception as exc: 62 | LG.log_and_raise(exc) 63 | else: 64 | return -------------------------------------------------------------------------------- /graphrepo/drillers/queue_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
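# NOTE (editor's illustration, not part of the repository): the typical
# Driller workflow defined above - mine once, persist to JSON, index later.
# It assumes a running Neo4j instance and a config file such as
# tests/cnfg_simple.yml or examples/configs/graphrepo.yml; the save path is
# a placeholder.
from graphrepo.drillers.driller import Driller

driller = Driller('tests/cnfg_simple.yml')
data = driller.drill_batch(index=False, save_path='data/repo.json')
driller.index_from_file('data/repo.json')  # index the saved batch later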
14 | """Default Parent class for drillers 15 | """ 16 | from abc import abstractmethod 17 | from datetime import datetime 18 | from py2neo import Graph 19 | from pydriller import RepositoryMining 20 | 21 | import graphrepo.utils as utl 22 | from graphrepo.config import Config 23 | from graphrepo.drillers.driller import Driller 24 | import graphrepo.drillers.batch_utils as b_utl 25 | from graphrepo.logger import Logger 26 | 27 | LG = Logger() 28 | 29 | 30 | class QueueDriller(Driller): 31 | """QueueDriller class - parses a git repo and publishes 32 | the data in a queue every n commits 33 | """ 34 | 35 | def __init__(self, neo, project, queue): 36 | """Initializes the properties of this class 37 | :param neo: 38 | :param project: 39 | :param queue: 40 | """ 41 | # TODO: validate inputs 42 | try: 43 | self.project, self.queue = project, queue 44 | self.config = Config() 45 | self.graph = None 46 | self.config.configure(**neo, **self.project) 47 | # self._connect() 48 | except Exception as exc: 49 | LG.log_and_raise(exc) 50 | 51 | @abstractmethod 52 | def connect_queue(self): 53 | """Establishes a connection to queue""" 54 | raise NotImplementedError 55 | 56 | @abstractmethod 57 | def send_index_data(self, data): 58 | """Indexes data""" 59 | raise NotImplementedError 60 | 61 | def drill_batch(self, index=True, save_path=None): 62 | """Extracts data from a software repository, with the option 63 | of saving it on diks and indexing it in Neo4j 64 | :param index: optional, if True, the data is indexed in Neo4j 65 | :param save_path: optional, if given, the data is stored on dik 66 | :returns: dictionary with all data 67 | """ 68 | start = datetime.now() 69 | print('Driller started at: \t', start) 70 | commits, parents, devs, dev_com, branches,\ 71 | branches_com, files, com_files, \ 72 | methods, files_methods, com_methods = \ 73 | [], [], [], [], [], [], [], [], [], [], [] 74 | commit_index = 0 75 | for commit in \ 76 | RepositoryMining(self.config.ct.repo, 77 | since=self.config.ct.start_date, 78 | to=self.config.ct.end_date).traverse_commits(): 79 | 80 | self.drill_commit(commit, commits, parents, devs, dev_com, branches, 81 | branches_com, files, com_files, 82 | methods, files_methods, com_methods) 83 | 84 | if commit_index == self.queue['commit_batch'] - 1: 85 | data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches, 86 | branches_com, files, com_files, 87 | methods, files_methods, com_methods) 88 | 89 | self.send_index_data( 90 | {'project_conf': self.project, 'data': data_}) 91 | 92 | commits, parents, devs, dev_com, branches, branches_com, files, com_files, methods, files_methods, com_methods = [ 93 | ], [], [], [], [], [], [], [], [], [], [] 94 | commit_index = 0 95 | else: 96 | commit_index += 1 97 | 98 | print('Driller finished in: \t', datetime.now() - start) 99 | -------------------------------------------------------------------------------- /graphrepo/drillers/rabbit_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """RabbitMQ driller - publishes the mined data to a RabbitMQ queue 15 | """ 16 | import json 17 | import pika 18 | 19 | from abc import abstractmethod 20 | from datetime import datetime 21 | from py2neo import Graph 22 | from pydriller import RepositoryMining 23 | 24 | import graphrepo.utils as utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.queue_driller import QueueDriller 27 | import graphrepo.drillers.batch_utils as b_utl 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class RabbitDriller(QueueDriller): 34 | """RabbitDriller class - parses a git repo and publishes 35 | the data to a RabbitMQ queue every n commits 36 | """ 37 | 38 | def connect_queue(self): 39 | """Establishes a connection to the queue""" 40 | try: 41 | credentials = pika.PlainCredentials( 42 | self.queue['username'], self.queue['password']) 43 | self.con_parameters = pika.ConnectionParameters(self.queue['host'], 44 | self.queue['port'], 45 | self.queue['vhost'], 46 | credentials) 47 | connection = pika.BlockingConnection( 48 | self.con_parameters) 49 | channel = connection.channel() 50 | 51 | channel.queue_declare(queue=self.queue['queue'], durable=True) 52 | return connection, channel 53 | except Exception as e: 54 | raise e 55 | 56 | def send_index_data(self, data): 57 | """Publishes the data to the RabbitMQ queue for indexing""" 58 | try: 59 | connection, channel = self.connect_queue() 60 | channel.basic_publish( 61 | exchange='', 62 | routing_key=self.queue['queue'], 63 | body=json.dumps(data), 64 | properties=pika.BasicProperties( 65 | delivery_mode=2, # make message persistent 66 | )) 67 | connection.close() 68 | except Exception as e: 69 | raise e 70 | -------------------------------------------------------------------------------- /graphrepo/drillers/stomp_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
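# NOTE (editor's illustration, not part of the repository): a consumer sketch
# for the messages published by RabbitDriller above, using the pika 1.x API
# pinned in requirements.txt. Host, credentials and queue name are
# placeholders; each message body is the JSON dict built in send_index_data.
import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='graphrepo', durable=True)


def on_message(ch, method, properties, body):
    msg = json.loads(body)
    # e.g. hand msg['data'] to a Driller's index_batch for indexing
    print(msg['project_conf'], len(msg['data']['commits']))
    ch.basic_ack(delivery_tag=method.delivery_tag)  # ack after processing


channel.basic_consume(queue='graphrepo', on_message_callback=on_message)
channel.start_consuming()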
14 | """Default Parent class for drillers 15 | """ 16 | import stomp 17 | import json 18 | 19 | from abc import abstractmethod 20 | from datetime import datetime 21 | from py2neo import Graph 22 | from pydriller import RepositoryMining 23 | 24 | import graphrepo.utils as utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.queue_driller import QueueDriller 27 | import graphrepo.drillers.batch_utils as b_utl 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class StompDriller(QueueDriller): 34 | """StompDriller class - parses a git repo and publishes 35 | the data in a queue every n commits 36 | """ 37 | 38 | def connect_queue(self): 39 | """Establishes a connection to queue""" 40 | try: 41 | conn = stomp.Connection( 42 | [(self.queue['host'], self.queue['port']) 43 | ], vhost=self.queue['vhost'], heartbeats=(10000, 10000) 44 | ) 45 | 46 | conn.connect(self.queue['username'], 47 | self.queue['password'], wait=True) 48 | return conn 49 | except Exception as e: 50 | raise e 51 | 52 | def send_index_data(self, data): 53 | """Indexes data""" 54 | try: 55 | conn = self.connect_queue() 56 | conn.send(body=json.dumps(data), destination=self.queue.queue) 57 | conn.disconnect() 58 | except Exception as e: 59 | raise e 60 | -------------------------------------------------------------------------------- /graphrepo/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Logger""" 15 | from graphrepo.singleton import Singleton 16 | 17 | 18 | class Logger(metaclass=Singleton): 19 | def __init__(self, *args, **kwargs): 20 | """Default init""" 21 | 22 | def log(self, exception): 23 | """Logs exceptions and prints it to console 24 | :param exception: Exception type from Python 25 | """ 26 | print('[EXCEPTION]: {}'.format(exception)) 27 | 28 | def log_and_raise(self, exception): 29 | """Logs, prints and raises exception 30 | :param exception: Python Exception object 31 | """ 32 | self.log(exception) 33 | raise exception 34 | -------------------------------------------------------------------------------- /graphrepo/mappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv import CSVMapper 2 | from .default import DefaultMapper 3 | -------------------------------------------------------------------------------- /graphrepo/mappers/csv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom mapper class with some abstractions""" 15 | from abc import abstractmethod 16 | import pandas as pd 17 | 18 | from graphrepo.mappers.default import DefaultMapper 19 | 20 | 21 | class CSVMapper(DefaultMapper): 22 | """Mapper which converts query results (e.g. lists 23 | of dictionaries or py2neo records) to pandas DataFrames""" 24 | 25 | def map(self, objects): 26 | """The default CSV map function; 27 | assumes the objects are an iterable that pandas 28 | can convert to a DataFrame (e.g. a list of dicts) 29 | """ 30 | return pd.DataFrame(objects) 31 | -------------------------------------------------------------------------------- /graphrepo/mappers/default.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom mapper class with some abstractions""" 15 | from abc import abstractmethod 16 | 17 | 18 | class DefaultMapper(): 19 | """The mappers are currently synchronous, but 20 | ideally they will be async in the future""" 21 | 22 | def __init__(self, *args, **kwargs): 23 | pass 24 | -------------------------------------------------------------------------------- /graphrepo/miners/__init__.py: -------------------------------------------------------------------------------- 1 | from .commit import CommitMiner 2 | from .default import DefaultMiner 3 | from .developer import DeveloperMiner 4 | from .file import FileMiner 5 | from .mine_manager import MineManager 6 | from .method import MethodMiner 7 | -------------------------------------------------------------------------------- /graphrepo/miners/commit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
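# NOTE (editor's illustration, not part of the repository): CSVMapper above
# simply hands the query results to pandas, so any iterable accepted by
# pd.DataFrame works. The records are placeholders.
from graphrepo.mappers import CSVMapper

mapper = CSVMapper()
commits = [{'hash': 'a1', 'message': 'first commit'},
           {'hash': 'b2', 'message': 'second commit'}]
frame = mapper.map(commits)          # -> 2x2 pandas DataFrame
frame.to_csv('commits.csv', index=False)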
14 | """This module mines commits and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | from graphrepo.miners.utils import format_commit_id_date 18 | 19 | 20 | class CommitMiner(DefaultMiner): 21 | """This class holds queries for commits""" 22 | 23 | def query(self, **kwargs): 24 | """Queries commits by any arguments given in kwargs 25 | For example kwargs can be {'hash': 'example-hash'} 26 | :param kwargs: any parameter and value, between hash, name or email 27 | :returns: list of commit nodes matched 28 | """ 29 | com_ = self.node_matcher.match("Commit", **kwargs) 30 | return [dict(x) for x in com_] 31 | 32 | def get_between_dates(self, start_date, end_date, 33 | project_id=None): 34 | """Returns all commits between start and end date 35 | :param start_date: timestamp, start date 36 | :param end_date: timestamp, end date 37 | :param project_id: optional; if given only the commits from a project 38 | are returned 39 | :returns: list of commitss 40 | """ 41 | com_filter, where = format_commit_id_date( 42 | project_id, start_date, end_date) 43 | query = """ 44 | MATCH (c: Commit {0}) 45 | {1} 46 | RETURN distinct c 47 | """.format(com_filter, where) 48 | dt_ = self.graph.run(query) 49 | return [dict(x['c']) for x in dt_.data()] 50 | 51 | def get_all(self,): 52 | """Returns all commits 53 | :returns: list of commit nodes 54 | """ 55 | com_ = self.node_matcher.match("Commit") 56 | return [dict(x) for x in com_] 57 | 58 | def get_commit_files(self, commit_hash): 59 | """Returns the files updated in a commit 60 | :param commit_hash: optional; if given, it will 61 | return the data only for one commit 62 | :returns: list of commit files 63 | """ 64 | query = """ 65 | MATCH (c:Commit {{hash: "{0}"}}) 66 | -[UpdateFile]->(f:File) 67 | return distinct f 68 | """.format(commit_hash) 69 | files_ = self.graph.run(query) 70 | return [x['f'] for x in files_.data()] 71 | 72 | def get_commit_file_updates(self, commit_hash): 73 | """Returns the updates a commit made to files (UpdateFile rel) 74 | :param commit_hash: optional; if given, it will 75 | return the data only for one commit 76 | :returns: list of 77 | """ 78 | query = """ 79 | MATCH (c:Commit {{hash: "{0}"}}) 80 | -[f: UpdateFile]->(fu:File) 81 | return distinct f 82 | """.format(commit_hash) 83 | files_ = self.graph.run(query) 84 | return [x['f'] for x in files_.data()] 85 | 86 | def get_commit_methods(self, commit_hash=None): 87 | """Returns the methods updated in a commit 88 | :param commit_hash: optional; if given, it will 89 | return the data only for one commit 90 | """ 91 | query = """ 92 | MATCH (c:Commit {{hash: "{0}"}}) 93 | -[UpdateMethod]->(m:Method) 94 | return distinct m 95 | """.format(commit_hash) 96 | files_ = self.graph.run(query) 97 | return [x['m'] for x in files_.data()] 98 | 99 | def get_commit_method_updates(self, commit_hash=None): 100 | """Returns the updatemethod relationships from a commit 101 | :param commit_hash: optional; if given, 102 | it will return the data only for one commit 103 | :param dic: optional, boolean for ocnverting the data to dictionaries 104 | """ 105 | query = """ 106 | MATCH (c:Commit {{hash: "{0}"}}) 107 | -[m:UpdateMethod]->(mu:Method) 108 | return distinct m 109 | """.format(commit_hash) 110 | files_ = self.graph.run(query) 111 | return [x['m'] for x in files_.data()] 112 | -------------------------------------------------------------------------------- /graphrepo/miners/default.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom miner class with some abstractions""" 15 | from abc import abstractmethod 16 | 17 | 18 | class DefaultMiner(): 19 | """The miners are currently synchronous, but 20 | ideally they will be async in the future""" 21 | 22 | def __init__(self, graph, node_matcher, rel_matcher, *args, **kwargs): 23 | self.graph = graph 24 | self.node_matcher = node_matcher 25 | self.rel_matcher = rel_matcher 26 | 27 | @abstractmethod 28 | def get_all(self): 29 | """This method returns all artifacts 30 | found by a miner""" 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /graphrepo/miners/developer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines developers and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | from graphrepo.miners.utils import format_commit_id_date as fcid 18 | 19 | 20 | class DeveloperMiner(DefaultMiner): 21 | """This class holds queries for the Developer nodes""" 22 | 23 | def query(self, **kwargs): 24 | """Queries developers by any arguments given in kwargs 25 | For example kwargs can be {'hash': 'example-hash'} or 26 | {'email': 'example-email'} 27 | :param kwargs: any parameter and value among hash, name or email 28 | :returns: list of nodes matched 29 | """ 30 | return self.node_matcher.match("Developer", **kwargs) 31 | 32 | def get_commits(self, dev_hash, project_id=None, 33 | start_date=None, end_date=None): 34 | """Returns all commits authored by a developer.
35 | Optionally, it also filters by project id 36 | :param dev_hash: developer unique identifier 37 | :param project_id: optional; if present the 38 | query returns the commits from a project 39 | :param start_date: optional timestamp; filter commits 40 | beginning with this date 41 | :param end_date: optional timestamp; filter commits 42 | until this date 43 | :returns: list of commits 44 | """ 45 | com_filter, where = fcid(project_id, 46 | start_date, end_date) 47 | cquery = """ 48 | MATCH (d:Developer {{hash: "{0}"}}) 49 | -[r:Author]-> 50 | (c:Commit {1}) 51 | {2} 52 | RETURN distinct c; 53 | """.format(dev_hash, com_filter, where) 54 | dt_ = self.graph.run(cquery) 55 | return [dict(x['c']) for x in dt_.data()] 56 | 57 | def get_files(self, dev_hash, project_id=None, 58 | start_date=None, end_date=None): 59 | """Returns all files edited by a developer. 60 | Optionally it also filters by project_id 61 | :param dev_hash: developer unique identifier 62 | :param project_id: optional; if present the query 63 | returns the files from a specific project 64 | :param start_date: optional timestamp; filter files 65 | beginning with this date 66 | :param end_date: optional timestamp; filter files 67 | until this date 68 | :returns: list of files 69 | """ 70 | com_filter, where = fcid(project_id, 71 | start_date, end_date) 72 | fquery = """ 73 | MATCH (d:Developer {{hash: "{0}"}}) 74 | -[r:Author]-> 75 | (c:Commit {1}) 76 | -[:UpdateFile]-> 77 | (f: File) 78 | {2} 79 | RETURN collect(distinct f); 80 | """.format(dev_hash, com_filter, where) 81 | dt_ = self.graph.run(fquery) 82 | return [dict(x) for x in dt_.data()[0]['collect(distinct f)']] 83 | 84 | def get_files_updates(self, dev_hash, project_id=None, 85 | start_date=None, end_date=None): 86 | """Returns all file update information (e.g. file complexity), 87 | for all files edited by a developer. 88 | Optionally it also filters by project_id 89 | :param dev_hash: developer unique identifier 90 | :param project_id: optional; if present the query 91 | returns the files from a specific project 92 | :param start_date: optional timestamp; filter files 93 | beginning with this date 94 | :param end_date: optional timestamp; filter files 95 | until this date 96 | :returns: list of file updates 97 | """ 98 | com_filter, where = fcid(project_id, 99 | start_date, end_date) 100 | fuquery = """ 101 | MATCH (d:Developer {{hash: "{0}"}}) 102 | -[r:Author]-> 103 | (c:Commit {1}) 104 | -[fu: UpdateFile]-> 105 | (f: File) 106 | {2} 107 | RETURN distinct fu; 108 | """.format(dev_hash, com_filter, where) 109 | 110 | dt_ = self.graph.run(fuquery) 111 | return [dict(x['fu']) for x in dt_.data()] 112 | 113 | def get_methods(self, dev_hash, project_id=None, 114 | start_date=None, end_date=None): 115 | """Returns all methods updated by a developer.
116 | Optionally it also filters by project_id 117 | :param dev_hash: developer unique identifier 118 | :param project_id: optional; if present the query 119 | returns the methods from a specific project 120 | :param start_date: optional timestamp; filter methods 121 | beginning with this date 122 | :param end_date: optional timestamp; filter methods 123 | until this date 124 | :returns: list of methods 125 | """ 126 | com_filter, where = fcid(project_id, 127 | start_date, end_date) 128 | mquery = """ 129 | MATCH (d:Developer {{hash: "{0}"}}) 130 | -[r:Author]-> 131 | (c:Commit {1}) 132 | -[um: UpdateMethod]-> 133 | (m: Method) 134 | {2} 135 | RETURN distinct m; 136 | """.format(dev_hash, com_filter, where) 137 | 138 | dt_ = self.graph.run(mquery) 139 | return [dict(x['m']) for x in dt_.data()] 140 | 141 | def get_method_updates(self, dev_hash, project_id=None, 142 | start_date=None, end_date=None): 143 | """Returns all method update information, for all 144 | methods updated by a developer. 145 | Optionally it also filters by project_id 146 | :param dev_hash: developer unique identifier 147 | :param project_id: optional; if present the query 148 | returns the method updates from a specific project 149 | :param start_date: optional timestamp; filter method updates 150 | beginning with this date 151 | :param end_date: optional timestamp; filter method updates 152 | until this date 153 | :returns: list of method updates 154 | """ 155 | com_filter, where = fcid(project_id, 156 | start_date, end_date) 157 | muquery = """ 158 | MATCH (d:Developer {{hash: "{0}"}}) 159 | -[r:Author]-> 160 | (c:Commit {1}) 161 | -[um: UpdateMethod]-> 162 | () 163 | {2} 164 | RETURN distinct um; 165 | """.format(dev_hash, com_filter, where) 166 | 167 | dt_ = self.graph.run(muquery) 168 | return [dict(x['um']) for x in dt_.data()] 169 | 170 | def get_all(self): 171 | return self.node_matcher.match("Developer") 172 | -------------------------------------------------------------------------------- /graphrepo/miners/file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines files and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | 18 | 19 | class FileMiner(DefaultMiner): 20 | """This class holds queries for the File nodes""" 21 | 22 | def query(self, **kwargs): 23 | """Searches for a file using the arguments in kwargs.
24 | If no kwargs are given it returns the first file found 25 | """ 26 | return self.node_matcher.match("File", **kwargs).first() 27 | 28 | def get_all(self): 29 | """Returns all nodes of type File 30 | :return: list of files 31 | """ 32 | return self.node_matcher.match("File") 33 | 34 | def get_change_history(self, file_hash): 35 | """Returns all UpdateFile relationships for a file 36 | :param file_hash: a string, unique identifier for file 37 | :return: list of UpdateFile relationships, converted 38 | to dictionaries; the raw py2neo records (which 39 | can be used in mappers) are available by 40 | running the query directly 41 | """ 42 | query = """MATCH ()-[r:UpdateFile]->(f:File {{hash: "{0}"}}) 43 | return distinct r 44 | """.format(file_hash) 45 | dt_ = self.graph.run(query) 46 | return [dict(x['r']) for x in dt_.data()] 47 | 48 | def get_current_methods(self, file_hash): 49 | """Returns all current methods of a file 50 | :param file_hash: a string, unique identifier for file 51 | :return: list of methods, converted to dictionaries; 52 | the raw py2neo records (which can be used 53 | in mappers) are available by running the 54 | query directly 55 | """ 56 | query = """MATCH (f:File {{hash: "{0}"}})-[r:Method]->(m:Method) 57 | return distinct m 58 | """.format(file_hash) 59 | dt_ = self.graph.run(query) 60 | return [dict(x['m']) for x in dt_.data()] 61 | 62 | def get_past_methods(self, file): 63 | """Returns methods that were removed from the file 64 | :param file: Py2Neo File object 65 | :return: list of Method objects 66 | """ 67 | # NOTE: currently not implemented - returns None 68 | # return [rel.end_node 69 | # for rel in self.graph.match([file, None], "HadMethod")] 70 | -------------------------------------------------------------------------------- /graphrepo/miners/method.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines methods and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | 18 | 19 | class MethodMiner(DefaultMiner): 20 | def __init__(self, graph, node_matcher, rel_matcher, *args, **kwargs): 21 | super().__init__(graph, node_matcher, rel_matcher, *args, **kwargs) 22 | 23 | def query(self, **kwargs): 24 | """Searches for a method using the arguments in kwargs.
25 | If no kwargs are given it returns the first method found 26 | """ 27 | return self.node_matcher.match("Method", **kwargs).first() 28 | 29 | def get_all(self): 30 | """Returns all nodes of type Method 31 | :return: list of methods 32 | """ 33 | return self.node_matcher.match("Method") 34 | 35 | def get_change_history(self, method_hash): 36 | """Returns all UpdateMethod relationships for a method 37 | :param method_hash: method unique identifier 38 | :return: list of UpdateMethod relationships, converted 39 | to dictionaries; the raw py2neo records (which 40 | can be used in mappers) are available by running 41 | the query directly 42 | """ 43 | query = """MATCH ()-[r:UpdateMethod]->(m: Method{{hash: "{0}"}}) 44 | RETURN distinct r 45 | """.format(method_hash) 46 | dt_ = self.graph.run(query) 47 | return [dict(x['r']) for x in dt_.data()] 48 | -------------------------------------------------------------------------------- /graphrepo/miners/mine_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module initializes and configures all miners""" 15 | from py2neo import Graph, NodeMatcher, RelationshipMatcher 16 | import graphrepo.utils as utl 17 | from graphrepo.config import Config 18 | from graphrepo.logger import Logger 19 | from graphrepo.singleton import Singleton 20 | from graphrepo import miners 21 | 22 | 23 | LG = Logger() 24 | 25 | 26 | class MineManager(metaclass=Singleton): 27 | """MineManager class - This class manages custom 28 | miners. At the moment we instantiate all miners, 29 | but other managers which handle different 'teams of miners' 30 | can be created. 31 | """ 32 | 33 | def __init__(self, config_path): 34 | """Initializes the properties of this class""" 35 | self.commit_miner, self.dev_miner, \ 36 | self.file_miner, self.method_miner = None, None, None, None 37 | try: 38 | if not config_path: 39 | raise FileNotFoundError 40 | neo, project = utl.parse_config(config_path) 41 | self.config = Config() 42 | self.config.configure(**neo, **project) 43 | self.graph = None 44 | self.node_matcher = None 45 | self.rel_matcher = None 46 | self.connect() 47 | except Exception as exc: 48 | LG.log_and_raise(exc) 49 | 50 | def connect(self): 51 | """Instantiates the connection to Neo4j and stores 52 | the graph internally. 53 | Raises an exception if the connection cannot be established 54 | """ 55 | try: 56 | self.graph = Graph(host=self.config.ct.db_url, 57 | user=self.config.ct.db_user, 58 | password=self.config.ct.db_pwd, 59 | http_port=self.config.ct.port) 60 | self.node_matcher = NodeMatcher(self.graph) 61 | self.rel_matcher = RelationshipMatcher(self.graph) 62 | self.init_miners() 63 | except Exception as exc: 64 | LG.log_and_raise(exc) 65 | 66 | def check_connection(self): 67 | """Checks if there is a db connection and raises 68 | ReferenceError if not.
69 | """ 70 | try: 71 | self.connect() 72 | except: 73 | raise ReferenceError("There is no valid " 74 | "database connection. Please " 75 | "configure and connect first.") 76 | 77 | def init_miners(self): 78 | """Initializes all miners""" 79 | try: 80 | # TODO: Parse this automatically? 81 | self.commit_miner = miners.CommitMiner( 82 | graph=self.graph, 83 | node_matcher=self.node_matcher, 84 | rel_matcher=self.rel_matcher) 85 | self.dev_miner = \ 86 | miners.DeveloperMiner(graph=self.graph, 87 | node_matcher=self.node_matcher, 88 | rel_matcher=self.rel_matcher) 89 | self.file_miner = \ 90 | miners.FileMiner(graph=self.graph, 91 | node_matcher=self.node_matcher, 92 | rel_matcher=self.rel_matcher) 93 | self.method_miner = \ 94 | miners.MethodMiner(graph=self.graph, 95 | node_matcher=self.node_matcher, 96 | rel_matcher=self.rel_matcher) 97 | 98 | except Exception as exc: 99 | LG.log_and_raise(exc) 100 | else: 101 | return 102 | 103 | def get_all_data(self): 104 | """Returns all nodes and relationships from Neo4j 105 | :returns: a tuple with two arrays: the first with nodes, 106 | the second with relationships 107 | """ 108 | nodes = self.node_matcher.match() 109 | rels = self.rel_matcher.match() 110 | 111 | return list(nodes), list(rels) 112 | -------------------------------------------------------------------------------- /graphrepo/miners/utils.py: -------------------------------------------------------------------------------- 1 | """Utils methods for miners""" 2 | 3 | 4 | def format_commit_id_date(project_id, start_date, end_date, commit_hash=None): 5 | """Formats commit query with id and dates 6 | :param project_id: the project unique identifier 7 | :param start_date: timestamp, commit start_date 8 | :param end_date: timestamp, commit end_date 9 | :param ccommit_hash: optional, if given the query 10 | filters by commit hash 11 | :returns: query filter string and where clause 12 | """ 13 | com_filter, where = "", "" 14 | if project_id and not commit_hash: 15 | com_filter += """{{project_id: "{0}"}}""".format(project_id) 16 | if project_id and commit_hash: 17 | com_filter += """{{project_id: "{0}", hash: {1}}}""".format( 18 | project_id, commit_hash) 19 | if start_date: 20 | where += "c.timestamp >= {0}".format(start_date) 21 | if end_date: 22 | where += " AND " if where else "" 23 | where += "c.timestamp <= {0}".format(end_date) 24 | where = "WHERE " + where if where else where 25 | 26 | return com_filter, where 27 | -------------------------------------------------------------------------------- /graphrepo/singleton.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Singleton metaclass""" 16 | 17 | 18 | class Singleton(type): 19 | """ 20 | Define an Instance operation that lets clients access its unique 21 | instance. 
22 | """ 23 | 24 | def __init__(cls, name, bases, attrs, *args, **kwargs): 25 | super().__init__(name, bases, attrs) 26 | cls._instance = None 27 | 28 | def __call__(cls, *args, **kwargs): 29 | if cls._instance is None: 30 | cls._instance = super().__call__(*args, **kwargs) 31 | return cls._instance 32 | -------------------------------------------------------------------------------- /graphrepo/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Utils methods for GraphRepo""" 15 | import json 16 | import hashlib 17 | from datetime import datetime 18 | import yaml 19 | 20 | 21 | class Dotdict(dict): 22 | """dot.notation access to dictionary attributes""" 23 | __getattr__ = dict.get 24 | __setattr__ = dict.__setitem__ 25 | __delattr__ = dict.__delitem__ 26 | 27 | 28 | def parse_config(path): 29 | with open(path, 'r') as ymlfile: 30 | conf = yaml.load(ymlfile, Loader=yaml.FullLoader) 31 | 32 | neo = conf['neo'] 33 | project = conf['project'] 34 | 35 | project['start_date'] = datetime.strptime( 36 | project['start_date'], '%d %B, %Y %H:%M') \ 37 | if project['start_date'] else None 38 | project['end_date'] = datetime.strptime( 39 | project['end_date'], '%d %B, %Y %H:%M') \ 40 | if project['end_date'] else None 41 | 42 | return neo, project 43 | 44 | 45 | def save_json(path, data): 46 | with open(path, 'w') as outfile: 47 | json.dump(data, outfile) 48 | 49 | 50 | def load_json(path): 51 | with open(path) as json_file: 52 | return json.load(json_file) 53 | 54 | 55 | def get_file_hash(file, project_id=None, use_new_path=False): 56 | name = '' 57 | if not file.old_path and file.new_path: 58 | # ADD File 59 | name = name + file.new_path 60 | elif file.old_path and not file.new_path: 61 | # DELETE 62 | name = name+file.old_path 63 | elif file.old_path and file.new_path: 64 | # MODIFY OR RENAME 65 | if use_new_path: 66 | name = name + file.new_path 67 | else: 68 | name = name + file.old_path 69 | 70 | name = name+file.filename 71 | name = project_id + name if project_id else name 72 | return hashlib.sha224(str(name).encode('utf-8')).hexdigest() 73 | 74 | 75 | def get_method_type(method, m_before, m_current): 76 | if method.name in m_before and method.name not in m_current: 77 | return "DELETE" 78 | elif method.name in m_before and method.name in m_current: 79 | return "MODIFY" 80 | else: 81 | return "ADD" 82 | 83 | 84 | def get_method_hash(method, file, project_id=None): 85 | fhash = get_file_hash(file, project_id) 86 | _fmname = fhash + "_" + method.name 87 | _fmname = project_id + _fmname if project_id else _fmname 88 | return hashlib.sha224(_fmname.encode('utf-8')).hexdigest() 89 | 90 | 91 | def get_author_hash(email): 92 | return hashlib.sha224(email.encode('utf-8')).hexdigest() 93 | 94 | 95 | def format_dev(dev, index_email=True): 96 | return { 97 | 'name': dev.author.name, 98 | 'email': dev.author.email if index_email else '', 99 | 
'hash': get_author_hash(dev.author.email) 100 | } 101 | 102 | 103 | def get_commit_hash(chash, project_id): 104 | return hashlib.sha224(str(project_id + chash).encode('utf-8')).hexdigest() 105 | 106 | 107 | def format_commit(com, project_id): 108 | return { 109 | 'hash': get_commit_hash(com.hash, project_id), 110 | 'commit_hash': com.hash, 111 | 'message': com.msg, 112 | 'is_merge': 1 if com.merge else 0, 113 | 'timestamp': com.author_date.timestamp(), 114 | 'project_id': project_id, 115 | 'dmm_unit_complexity': com.dmm_unit_complexity if com.dmm_unit_complexity else -1, 116 | 'dmm_unit_interfacing': com.dmm_unit_interfacing if com.dmm_unit_interfacing else -1, 117 | 'dmm_unit_size': com.dmm_unit_size if com.dmm_unit_size else -1, 118 | } 119 | 120 | 121 | def format_parent_commit(c_hash, parent_hash, project_id=None): 122 | return { 123 | 'child_hash': c_hash, 124 | 'parent_hash': get_commit_hash(parent_hash, project_id) 125 | } 126 | 127 | 128 | def format_branch(name, project_id): 129 | return { 130 | 'hash': hashlib.sha224(str(project_id+name).encode('utf-8')).hexdigest(), 131 | 'project_id': project_id, 132 | 'name': name 133 | } 134 | 135 | 136 | def format_author_commit(dev, com, timestamp): 137 | return {'commit_hash': com['hash'], 138 | 'author_hash': dev['hash'], 139 | 'timestamp': timestamp, 140 | } 141 | 142 | 143 | def format_branch_commit(bhash, chash): 144 | return {'branch_hash': bhash, 145 | 'commit_hash': chash 146 | } 147 | 148 | 149 | def format_file(file, project_id): 150 | return { 151 | 'hash': get_file_hash(file, project_id), 152 | 'merge_hash': get_file_hash(file, project_id, use_new_path=True), 153 | 'name': file.filename, 154 | 'project_id': project_id, 155 | 'type': '.' + file.filename.split('.')[-1] 156 | } 157 | 158 | 159 | def format_commit_file(c_hash, file, timestamp, project_id, index_code=True): 160 | f_hash = get_file_hash(file, project_id) 161 | f_merge_hash = get_file_hash(file, project_id, use_new_path=True) 162 | dt_ = {'commit_hash': c_hash, 'file_hash': f_hash, 163 | 'attributes': { 164 | 'timestamp': timestamp, 165 | 'old_path': file.old_path if file.old_path else '', 166 | 'path': file.new_path if file.new_path else '', 167 | 'source_code': '', 168 | 'source_code_before': '', 169 | 'diff': file.diff, 170 | 'nloc': file.nloc if file.nloc else -1, 171 | 'complexity': file.complexity if file.complexity else -1, 172 | 'token_count': file.token_count if file.token_count else -1, 173 | 'added': file.added, 174 | 'removed': file.removed, 175 | 'type': file.change_type.name, 176 | 'f_hash': f_hash, 177 | 'm_hash': f_merge_hash}} 178 | 179 | if index_code: 180 | dt_['attributes']['source_code'] = str( 181 | file.source_code) if file.source_code else '' 182 | dt_['attributes']['source_code_before'] = str( 183 | file.source_code_before) if file.source_code_before else '' 184 | 185 | return dt_ 186 | 187 | 188 | def format_commit_method(c_hash, m_hash, met, timestamp): 189 | return { 190 | 'commit_hash': c_hash, 191 | 'method_hash': m_hash, 192 | 'attributes': { 193 | 'timestamp': timestamp, 194 | 'long_name': met.long_name, 195 | 'parameters': met.parameters, 196 | 'complexity': met.complexity, 197 | 'nloc': met.nloc, 198 | 'fan_in': met.fan_in, 199 | 'fan_out': met.fan_out, 200 | 'general_fan_out': met.general_fan_out, 201 | 'length': met.length, 202 | 'token_count': met.token_count, 203 | 'start_line': met.start_line, 204 | 'end_line': met.end_line}} 205 | 206 | 207 | def format_method(met, file, project_id): 208 | return { 209 | 'hash':
get_method_hash(met, file, project_id), 210 | 'name': met.name, 211 | 'file_name': met.filename, 212 | 'project_id': project_id} 213 | 214 | 215 | def format_file_method(f_hash, m_hash): 216 | return {'file_hash': f_hash, 'method_hash': m_hash} 217 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lizard==1.16.6 2 | pytz==2018.9 3 | psutil==5.7.0 4 | py2neo==4.3.0 5 | pydriller==1.15.1 6 | requests==2.21.0 7 | pytest==5.3.5 8 | GitPython==3.1.0 9 | PyYAML==5.3.1 10 | diskcache==4.1.0 11 | pika==1.1.0 12 | stomp.py==6.1.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # v0.3.5 released 2 | from setuptools import setup, find_packages 3 | 4 | with open('requirements.txt') as reqs_file: 5 | requirements = reqs_file.read().splitlines() 6 | 7 | setup(name="graphrepo", 8 | version="0.3.5", 9 | description="A tool that maps a GitHub repo to Neo4j and helps mine the repo in the DB", 10 | url="https://github.com/NullConvergence/GraphRepo", 11 | license='Apache License', 12 | python_requires='>=3.5', 13 | install_requires=requirements, 14 | packages=find_packages('.'), 15 | package_dir={'graphrepo': 'graphrepo'}) 16 | 17 | # python3 setup.py sdist bdist_wheel 18 | # python3 -m twine upload dist/* 19 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/__init__.py -------------------------------------------------------------------------------- /tests/cnfg_init.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 200 7 | 8 | project: 9 | repo: tests/gr-test 10 | start_date: "14 May, 2020 00:00" 11 | end_date: "15 May, 2020 23:00" 12 | project_id: 'graph_repo_test' 13 | index_code: False 14 | index_developer_email: True -------------------------------------------------------------------------------- /tests/cnfg_simple.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 200 7 | 8 | project: 9 | repo: tests/gr-test 10 | start_date: "14 May, 2020 00:00" 11 | end_date: "15 May, 2020 02:00" 12 | project_id: 'graph_repo_test' 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /tests/test_cache_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
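# NOTE (editor's illustration, not part of the repository): how the YAML
# configs above map onto graphrepo.utils.parse_config - the 'neo' and
# 'project' sections come back as dicts, with the date strings parsed to
# datetime objects.
from graphrepo.utils import parse_config

neo, project = parse_config('tests/cnfg_simple.yml')
print(neo['db_url'], neo['port'])  # localhost 7687
print(project['start_date'])       # 2020-05-14 00:00:00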
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.cache_driller import CacheDriller 18 | 19 | 20 | class TestCacheDriller: 21 | def test_indexing(self): 22 | folder = os.path.dirname(os.path.abspath(__file__)) 23 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml')) 24 | test_driller.drill_batch_cache_sequential() 25 | records = [r for r in test_driller.graph.run( 26 | "MATCH(n) RETURN n")] 27 | assert len(records) == 22 28 | 29 | test_driller.clean() 30 | 31 | def test_drill_batch_cache(self): 32 | folder = os.path.dirname(os.path.abspath(__file__)) 33 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml')) 34 | test_driller.drill_batch_cache_all() 35 | records = [r for r in test_driller.graph.run( 36 | "MATCH(n) RETURN n")] 37 | assert len(records) == 22 38 | 39 | test_driller.clean() 40 | -------------------------------------------------------------------------------- /tests/test_commit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pytest 17 | import yaml 18 | 19 | from py2neo import NodeMatcher, RelationshipMatcher 20 | from graphrepo.drillers.driller import Driller 21 | from graphrepo.drillers.cache_driller import CacheDriller 22 | 23 | 24 | class TestCommit: 25 | """Most data is indexed when indexing a commit 26 | so this class tests indexing for multiple models""" 27 | 28 | def test_nodes_index(self): 29 | folder = os.path.dirname(os.path.abspath(__file__)) 30 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 31 | test_driller.drill_batch() 32 | 33 | # test that all nodes were indexed 34 | node_matcher = NodeMatcher(test_driller.graph) 35 | all_commits = list(node_matcher.match("Commit")) 36 | assert len(all_commits) == 8 37 | 38 | all_devs = list(node_matcher.match("Developer")) 39 | assert len(all_devs) == 2 40 | 41 | all_files = list(node_matcher.match("File")) 42 | assert len(all_files) == 6 43 | 44 | all_methods = list(node_matcher.match("Method")) 45 | assert len(all_methods) == 5 46 | 47 | all_branches = list(node_matcher.match("Branch")) 48 | assert len(all_branches) == 1 49 | 50 | test_driller.clean() 51 | 52 | def test_rel_index(self): 53 | folder = os.path.dirname(os.path.abspath(__file__)) 54 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 55 | test_driller.drill_batch() 56 | 57 | # test that all relationships were indexed 58 | rel_matcher = RelationshipMatcher(test_driller.graph) 59 | 60 | all_branch = list(rel_matcher.match(None, "BranchCommit")) 61 | assert len(all_branch) == 8 62 | 63 | all_authorship = list(rel_matcher.match(None, "Author")) 64 | assert len(all_authorship) == 8 65 | 66 | all_parent = list(rel_matcher.match(None, "Parent")) 67 | assert len(all_parent) == 8 68 | 69 | all_updatedfile =
list(rel_matcher.match(None, "UpdateFile")) 70 | assert len(all_updatedfile) == 9 71 | 72 | all_hasmethod = list(rel_matcher.match(None, "Method")) 73 | assert len(all_hasmethod) == 5 74 | 75 | all_updatemethod = list(rel_matcher.match(None, "UpdateMethod")) 76 | assert len(all_updatemethod) == 9 77 | 78 | test_driller.clean() 79 | 80 | def test_rel_index_cache(self): 81 | folder = os.path.dirname(os.path.abspath(__file__)) 82 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_simple.yml')) 83 | test_driller.drill_batch_cache_sequential() 84 | 85 | # test that all relationships were indexed 86 | rel_matcher = RelationshipMatcher(test_driller.graph) 87 | 88 | all_branch = list(rel_matcher.match(None, "BranchCommit")) 89 | assert len(all_branch) == 8 90 | 91 | all_authorship = list(rel_matcher.match(None, "Author")) 92 | assert len(all_authorship) == 8 93 | 94 | all_parent = list(rel_matcher.match(None, "Parent")) 95 | assert len(all_parent) == 8 96 | 97 | all_updatedfile = list(rel_matcher.match(None, "UpdateFile")) 98 | assert len(all_updatedfile) == 9 99 | 100 | all_hasmethod = list(rel_matcher.match(None, "Method")) 101 | assert len(all_hasmethod) == 5 102 | 103 | all_updatemethod = list(rel_matcher.match(None, "UpdateMethod")) 104 | assert len(all_updatemethod) == 9 105 | 106 | test_driller.clean() 107 | 108 | def test_custom_attributes_rel(self): 109 | folder = os.path.dirname(os.path.abspath(__file__)) 110 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 111 | test_driller.drill_batch() 112 | 113 | node_matcher = NodeMatcher(test_driller.graph) 114 | rel_matcher = RelationshipMatcher(test_driller.graph) 115 | 116 | commit = node_matcher.match( 117 | "Commit", hash="aa6fa504ccb0fa919acc3cb31e510dc2048314eb0656f34babada15c").first() 118 | assert commit['is_merge'] == 0 119 | 120 | update_file_rel = rel_matcher.match([commit], "UpdateFile").first() 121 | assert update_file_rel['complexity'] == 2 122 | assert update_file_rel['nloc'] == 8 123 | assert update_file_rel['old_path'] == 'gr_test/default_class.py' 124 | assert update_file_rel['path'] == 'gr_test/default_class.py' 125 | assert update_file_rel['token_count'] == 42 126 | assert update_file_rel['type'] == 'MODIFY' 127 | assert update_file_rel['removed'] == 6 128 | assert update_file_rel['added'] == 0 129 | 130 | update_method_rel = rel_matcher.match( 131 | [commit], 'UpdateMethod').first() 132 | # assert update_method_rel['type'] == 'DELETE' 133 | assert update_method_rel['nloc'] == 5 134 | assert update_method_rel['complexity'] == 2 135 | assert update_method_rel['token_count'] == 21 136 | assert update_method_rel['length'] == 5 137 | assert update_method_rel['fan_in'] == 0 138 | assert update_method_rel['fan_out'] == 0 139 | assert update_method_rel['start_line'] == 11 140 | assert update_method_rel['end_line'] == 15 141 | 142 | test_driller.clean() 143 | -------------------------------------------------------------------------------- /tests/test_commit_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License.
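# NOTE (editor's illustration, not part of the repository): the query
# fragments produced by graphrepo.miners.utils.format_commit_id_date, on
# which the miners exercised by these tests rely. Values are placeholders.
from graphrepo.miners.utils import format_commit_id_date

com_filter, where = format_commit_id_date('graph_repo_test', 100, 200)
print(com_filter)  # {project_id: "graph_repo_test"}
print(where)       # WHERE c.timestamp >= 100 AND c.timestamp <= 200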
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from datetime import datetime 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.commit import CommitMiner 20 | 21 | 22 | class TestCommitMiner: 23 | def test_gets(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | st_date = datetime.strptime( 29 | '14 May, 2020 00:00', '%d %B, %Y %H:%M').timestamp() 30 | end_date = datetime.strptime( 31 | '15 May, 2020 02:00', '%d %B, %Y %H:%M').timestamp() 32 | 33 | n_matcher = NodeMatcher(test_driller.graph) 34 | r_matcher = RelationshipMatcher(test_driller.graph) 35 | 36 | com_miner = CommitMiner(test_driller.graph, n_matcher, r_matcher) 37 | 38 | all_com = com_miner.get_all() 39 | assert len(all_com) == 8 40 | 41 | all_com_dates = com_miner.get_between_dates(st_date, end_date) 42 | assert len(all_com_dates) == 8 43 | 44 | c_files = com_miner.get_commit_files( 45 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 46 | assert len(c_files) == 3 47 | 48 | c_file_updates = com_miner.get_commit_file_updates( 49 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 50 | assert len(c_file_updates) == 3 51 | 52 | c_methods = com_miner.get_commit_methods( 53 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 54 | assert len(c_methods) == 3 55 | 56 | c_method_updates = com_miner.get_commit_method_updates( 57 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 58 | assert len(c_method_updates) == 3 59 | 60 | test_driller.clean() 61 | -------------------------------------------------------------------------------- /tests/test_csv_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers import Driller 19 | from graphrepo.mappers import CSVMapper 20 | from graphrepo.miners import CommitMiner 21 | 22 | 23 | class TestCSVMapper: 24 | """Tests that data mined from the graph is 25 | correctly mapped to a tabular (CSV) format""" 26 | 27 | def test_csv_mapper(self): 28 | folder = os.path.dirname(os.path.abspath(__file__)) 29 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 30 | test_driller.drill_batch() 31 | 32 | n_matcher = NodeMatcher(test_driller.graph) 33 | r_matcher = RelationshipMatcher(test_driller.graph) 34 | 35 | com_miner = CommitMiner(test_driller.graph, n_matcher, r_matcher) 36 | mapper = CSVMapper() 37 | 38 | commits = com_miner.get_all() 39 | mapped_commits = mapper.map(commits) 40 | assert mapped_commits.shape == (8, 9)  # 8 commits x 9 attributes 41 | 42 | c_files = com_miner.get_commit_files( 43 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 44 | c_csv = mapper.map(c_files) 45 | assert c_csv.shape == (3, 5)  # 3 files x 5 attributes 46 | 47 | test_driller.clean() 48 | -------------------------------------------------------------------------------- /tests/test_db_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from graphrepo.drillers.default import DefaultDriller 4 | import graphrepo.drillers.db_init as db_init 5 | 6 | from py2neo.database import Schema 7 | 8 | 9 | class TestDBInit: 10 | def test_hash_constraints(self): 11 | folder = os.path.dirname(os.path.abspath(__file__)) 12 | test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml')) 13 | 14 | db_init.create_hash_constraints(test_driller.graph) 15 | 16 | schm = Schema(test_driller.graph) 17 | 18 | labels = ["Developer", "Branch", "Commit", "File", "Method"] 19 | 20 | for label in labels: 21 | constraints = schm.get_uniqueness_constraints(label) 22 | assert len(constraints) == 1 23 | 24 | # clean 25 | for label in labels: 26 | schm.drop_uniqueness_constraint(label, 'hash') 27 | 28 | def test_indices(self): 29 | folder = os.path.dirname(os.path.abspath(__file__)) 30 | test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml')) 31 | 32 | db_init.create_indices(test_driller.graph, hash_index=True) 33 | 34 | schm = Schema(test_driller.graph) 35 | 36 | index_authors = schm.get_indexes("Developer") 37 | assert len(index_authors) == 1 38 | 39 | index_branch = schm.get_indexes("Branch") 40 | assert len(index_branch) == 2 41 | 42 | index_commits = schm.get_indexes("Commit") 43 | assert len(index_commits) == 2 44 | 45 | index_files = schm.get_indexes("File") 46 | assert len(index_files) == 3 47 | 48 | index_methods = schm.get_indexes("Method") 49 | assert len(index_methods) == 3 50 | 51 | # clean 52 | schm.drop_index("Developer", "hash") 53 | schm.drop_index("Branch", "hash") 54 | schm.drop_index("Branch", "project_id") 55 | schm.drop_index("Commit", "hash") 56 | schm.drop_index("Commit", "project_id") 57 | schm.drop_index("File", "hash") 58 | schm.drop_index("File", "project_id") 59 | schm.drop_index("Method", "hash") 60 | schm.drop_index("Method", "project_id") 61 | -------------------------------------------------------------------------------- /tests/test_dev_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from datetime import datetime 16 | import os 17 | 18 | from py2neo import NodeMatcher, RelationshipMatcher 19 | from graphrepo.drillers.driller import Driller 20 | from graphrepo.miners.developer import DeveloperMiner 21 | 22 | 23 | class TestDevMiner: 24 | def test_gets(self): 25 | folder = os.path.dirname(os.path.abspath(__file__)) 26 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 27 | test_driller.drill_batch() 28 | 29 | st_date = datetime.strptime( 30 | "14 May, 2020 00:00", '%d %B, %Y %H:%M').timestamp() 31 | end_date = datetime.strptime( 32 | "15 May, 2020 02:00", '%d %B, %Y %H:%M').timestamp() 33 | 34 | n_matcher = NodeMatcher(test_driller.graph) 35 | r_matcher = RelationshipMatcher(test_driller.graph) 36 | 37 | dev_miner = DeveloperMiner(test_driller.graph, n_matcher, r_matcher) 38 | 39 | all_devs = dev_miner.get_all() 40 | assert len(all_devs) == 2 41 | 42 | all_commits = dev_miner.get_commits( 43 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb") 44 | assert len(all_commits) == 7 45 | 46 | all_com_id = dev_miner.get_commits( 47 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 48 | project_id=test_driller.config.ct.project_id 49 | ) 50 | assert len(all_com_id) == 7 51 | 52 | all_com_id_dates = dev_miner.get_commits( 53 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 54 | project_id=test_driller.config.ct.project_id, 55 | start_date=st_date, 56 | end_date=end_date 57 | ) 58 | assert len(all_com_id_dates) == 7 59 | 60 | all_files = dev_miner.get_files( 61 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 62 | ) 63 | assert len(all_files) == 6 64 | 65 | all_files_id_dates = dev_miner.get_files( 66 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 67 | project_id=test_driller.config.ct.project_id, 68 | start_date=st_date, 69 | end_date=end_date 70 | ) 71 | assert len(all_files_id_dates) == 6 72 | 73 | files_updates = dev_miner.get_files_updates( 74 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 75 | ) 76 | assert len(files_updates) == 9 77 | 78 | files_updates_id_dates = dev_miner.get_files_updates( 79 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 80 | project_id=test_driller.config.ct.project_id, 81 | start_date=st_date, 82 | end_date=end_date 83 | ) 84 | assert len(files_updates_id_dates) == 9 85 | 86 | all_methods = dev_miner.get_methods( 87 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 88 | ) 89 | assert len(all_methods) == 5 90 | 91 | all_methods_id_dates = dev_miner.get_methods( 92 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 93 | project_id=test_driller.config.ct.project_id, 94 | start_date=st_date, 95 | end_date=end_date 96 | ) 97 | assert len(all_methods_id_dates) == 5 98 | 99 | method_updates = dev_miner.get_method_updates( 100 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 101 | ) 102 | assert len(method_updates) == 9 103 | 104 | method_updates_id_dates = dev_miner.get_method_updates( 
105 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 106 | project_id=test_driller.config.ct.project_id, 107 | start_date=st_date, 108 | end_date=end_date 109 | ) 110 | assert len(method_updates_id_dates) == 9 111 | 112 | test_driller.clean() 113 | -------------------------------------------------------------------------------- /tests/test_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.driller import Driller 18 | 19 | 20 | class TestDriller: 21 | def test_configure(self): 22 | folder = os.path.dirname(os.path.abspath(__file__)) 23 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 24 | 25 | assert test_driller.config.ct.db_url == 'localhost' 26 | assert test_driller.config.ct.repo == 'tests/gr-test' 27 | 28 | assert test_driller.graph is not None 29 | 30 | def test_indexing(self): 31 | folder = os.path.dirname(os.path.abspath(__file__)) 32 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 33 | test_driller.drill_batch() 34 | records = [r for r in test_driller.graph.run( 35 | "MATCH(n) RETURN n")] 36 | assert len(records) == 22 37 | 38 | test_driller.clean() 39 | 40 | def test_index_save(self): 41 | folder = os.path.dirname(os.path.abspath(__file__)) 42 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 43 | test_driller.drill_batch(save_path='data/graphrepo.json') 44 | records = [r for r in test_driller.graph.run( 45 | "MATCH(n) RETURN n")] 46 | assert len(records) == 22 47 | 48 | test_driller.clean() 49 | 50 | test_driller.index_from_file(file_path='data/graphrepo.json') 51 | records = [r for r in test_driller.graph.run( 52 | "MATCH(n) RETURN n")] 53 | assert len(records) == 22 54 | 55 | os.remove('data/graphrepo.json') 56 | test_driller.clean() 57 | -------------------------------------------------------------------------------- /tests/test_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/test_file.py -------------------------------------------------------------------------------- /tests/test_file_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.file import FileMiner 20 | 21 | 22 | class TestFileMiner: 23 | def test_get_all(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | n_matcher = NodeMatcher(test_driller.graph) 29 | r_matcher = RelationshipMatcher(test_driller.graph) 30 | 31 | f_miner = FileMiner(test_driller.graph, n_matcher, r_matcher) 32 | 33 | all_files = f_miner.get_all() 34 | assert len(all_files) == 6 35 | 36 | # get readme file 37 | readme = f_miner.query(name='README.MD') 38 | assert readme['name'] == 'README.MD' 39 | 40 | # get file history 41 | f_hash = 'f85f4af5b20ddd617f93da13c7789a65fb972e68a8d634d5f253abab' 42 | update_history = f_miner.get_change_history(f_hash) 43 | assert len(update_history) == 3 44 | 45 | # test file get methods 46 | current_m = f_miner.get_current_methods(f_hash) 47 | assert len(current_m) == 2 48 | 49 | test_driller.clean() 50 | -------------------------------------------------------------------------------- /tests/test_method_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.method import MethodMiner 20 | 21 | 22 | class TestMethodMiner: 23 | def test_get_all(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | n_matcher = NodeMatcher(test_driller.graph) 29 | r_matcher = RelationshipMatcher(test_driller.graph) 30 | 31 | m_miner = MethodMiner(test_driller.graph, n_matcher, r_matcher) 32 | 33 | all_methods = m_miner.get_all() 34 | assert len(all_methods) == 5 35 | m_hash = '45ce8dcd8b0cd8ed42e592ce828ab6418e7c79713b8dc99805bcb7ea' 36 | met = m_miner.query(hash=m_hash) 37 | assert met['name'] == 'get_name' 38 | 39 | history = m_miner.get_change_history(m_hash) 40 | assert len(history) == 2 41 | 42 | test_driller.clean() 43 | -------------------------------------------------------------------------------- /tests/test_queue_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.queue_driller import QueueDriller 18 | 19 | 20 | # class TestQueueDriller: 21 | # def test_indexing(self): 22 | # folder = os.path.dirname(os.path.abspath(__file__)) 23 | # test_driller = QueueDriller(os.path.join(folder, 'cnfg_init.yml')) 24 | # test_driller.drill_batch() 25 | # records = [r for r in test_driller.graph.run( 26 | # "MATCH(n) RETURN n")] 27 | # assert len(records) == 22 28 | 29 | # test_driller.clean() 30 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from graphrepo.utils import parse_config 17 | 18 | 19 | class TestUtils: 20 | def test_parse_config(self): 21 | folder = os.path.dirname(os.path.abspath(__file__)) 22 | neo, project = parse_config(os.path.join(folder, 'cnfg_init.yml')) 23 | assert neo['db_url'] == 'localhost' 24 | assert neo['db_user'] == 'neo4j' 25 | assert project['repo'] == 'tests/gr-test' 26 | --------------------------------------------------------------------------------
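The tests above double as usage documentation for GraphRepo's three main components: a driller indexes a Git repository into Neo4j from a YAML config, the miners query the resulting graph through py2neo matchers, and a mapper turns query results into tabular data. The sketch below strings those same calls together outside of a test harness; it is a minimal illustration, not a file from the repository, and it assumes a Neo4j instance is running and reachable with the credentials in cnfg_simple.yml.

# Minimal end-to-end sketch (assumes a running Neo4j instance and a valid
# config; every class and method used here appears in the tests above).
from py2neo import NodeMatcher, RelationshipMatcher

from graphrepo.drillers.driller import Driller
from graphrepo.mappers import CSVMapper
from graphrepo.miners.commit import CommitMiner

# Parse the YAML config and connect to the database.
driller = Driller('tests/cnfg_simple.yml')

# Index the repository named in the config into the graph.
driller.drill_batch()

# Miners take the py2neo graph plus node and relationship matchers.
n_matcher = NodeMatcher(driller.graph)
r_matcher = RelationshipMatcher(driller.graph)
com_miner = CommitMiner(driller.graph, n_matcher, r_matcher)

# Query the graph and map the results to a tabular (DataFrame-like) form.
commits = com_miner.get_all()
table = CSVMapper().map(commits)
print(table.shape)  # (number of commits, number of attributes)

# Remove the indexed data when finished.
driller.clean()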