├── .bettercodehub.yml ├── .gitignore ├── .gitmodules ├── .pylintrc ├── LICENSE ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── GraphRepoArch.svg │ ├── GraphRepoArch_old.svg │ ├── GraphRepoDS.svg │ ├── GraphRepoSchema.svg │ ├── _templates │ └── breadcrumbs.html │ ├── architecture.rst │ ├── conf.py │ ├── configuration.rst │ ├── css │ └── custom.css │ ├── data_structure.rst │ ├── driller.rst │ ├── examples.rst │ ├── index.rst │ ├── installation.rst │ ├── mappers.rst │ └── miners.rst ├── examples ├── __init__.py ├── all_method_complexity.py ├── benchmarks │ ├── all_data.py │ ├── all_methods_complexity.py │ ├── dev_files.py │ ├── dev_methods.py │ └── file_nloc.py ├── configs │ ├── graphrepo.yml │ ├── grepo-test.yml │ ├── hadoop.yml │ ├── jax.yml │ ├── kibana.yml │ ├── pydriller.yml │ └── tensorflow.yml ├── dev_data.py ├── file_complexity.py ├── index_all.py └── mine_all.py ├── graphrepo ├── __init__.py ├── config.py ├── drillers │ ├── __init__.py │ ├── batch_utils.py │ ├── cache_driller.py │ ├── db_init.py │ ├── default.py │ ├── delete_all.py │ ├── drill_cache.py │ ├── driller.py │ ├── queue_driller.py │ ├── rabbit_driller.py │ └── stomp_driller.py ├── logger.py ├── mappers │ ├── __init__.py │ ├── csv.py │ └── default.py ├── miners │ ├── __init__.py │ ├── commit.py │ ├── default.py │ ├── developer.py │ ├── file.py │ ├── method.py │ ├── mine_manager.py │ └── utils.py ├── singleton.py └── utils.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── cnfg_init.yml ├── cnfg_simple.yml ├── test_cache_driller.py ├── test_commit.py ├── test_commit_miner.py ├── test_csv_mapper.py ├── test_db_init.py ├── test_dev_miner.py ├── test_driller.py ├── test_file.py ├── test_file_miner.py ├── test_method_miner.py ├── test_queue_driller.py └── test_utils.py /.bettercodehub.yml: -------------------------------------------------------------------------------- 1 | component_depth: 1 2 | languages: 3 | - python 4 | exclude: 5 | - /examples/.* 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # VSCode 2 | .vscode/ 3 | 4 | # repo 5 | repos/ 6 | data/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/gr-test"] 2 | path = tests/gr-test 3 | url = https://github.com/NullConvergence/gr-test 4 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns= 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=1 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist=numpy 36 | 37 | # Allow optimization of some AST trees. This will activate a peephole AST 38 | # optimizer, which will apply various small optimizations. For instance, it can 39 | # be used to obtain the result of joining multiple strings with the addition 40 | # operator. Joining a lot of strings can lead to a maximum recursion error in 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting 42 | # AST will be different than the one from reality. This option is deprecated 43 | # and it will be removed in Pylint 2.0. 
44 | optimize-ast=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 51 | confidence= 52 | 53 | # Enable the message, report, category or checker with the given id(s). You can 54 | # either give multiple identifier separated by comma (,) or put this option 55 | # multiple time (only on the command line, not in the configuration file where 56 | # it should appear only once). See also the "--disable" option for examples. 57 | #enable= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once).You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use"--disable=all --enable=classes 67 | # --disable=W" 68 | disable=long-suffix,standarderror-builtin,indexing-exception,delslice-method,unichr-builtin,dict-view-method,parameter-unpacking,unicode-builtin,cmp-builtin,intern-builtin,round-builtin,backtick,nonzero-method,xrange-builtin,coerce-method,raw_input-builtin,old-division,filter-builtin-not-iterating,old-octal-literal,input-builtin,map-builtin-not-iterating,buffer-builtin,basestring-builtin,zip-builtin-not-iterating,using-cmp-argument,unpacking-in-except,old-raise-syntax,coerce-builtin,dict-iter-method,hex-method,range-builtin-not-iterating,useless-suppression,cmp-method,print-statement,reduce-builtin,file-builtin,long-builtin,getslice-method,execfile-builtin,no-absolute-import,metaclass-assignment,oct-method,reload-builtin,import-star-module-level,suppressed-message,apply-builtin,raising-string,next-method-called,setslice-method,old-ne-operator,arguments-differ,wildcard-import,locally-disabled 69 | 70 | 71 | [REPORTS] 72 | 73 | # Set the output format. Available formats are text, parseable, colorized, msvs 74 | # (visual studio) and html. You can also give a reporter class, eg 75 | # mypackage.mymodule.MyReporterClass. 76 | output-format=text 77 | 78 | # Put messages in a separate file for each module / package specified on the 79 | # command line instead of printing them on stdout. Reports (if any) will be 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 81 | # and it will be removed in Pylint 2.0. 82 | files-output=no 83 | 84 | # Tells whether to display a full report or only the messages 85 | reports=yes 86 | 87 | # Python expression which should return a note less than 10 (10 is the highest 88 | # note). You have access to the variables errors warning, statement which 89 | # respectively contain the number of errors / warnings messages and the total 90 | # number of statements analyzed. This is used by the global evaluation report 91 | # (RP0004). 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 93 | 94 | # Template used to display messages. This is a python new-style format string 95 | # used to format the message information. 
See doc for all details 96 | #msg-template= 97 | 98 | 99 | [BASIC] 100 | 101 | # Good variable names which should always be accepted, separated by a comma 102 | good-names=i,j,k,ex,Run,_ 103 | 104 | # Bad variable names which should always be refused, separated by a comma 105 | bad-names=foo,bar,baz,toto,tutu,tata 106 | 107 | # Colon-delimited sets of names that determine each other's naming style when 108 | # the name regexes allow several styles. 109 | name-group= 110 | 111 | # Include a hint for the correct naming format with invalid-name 112 | include-naming-hint=no 113 | 114 | # List of decorators that produce properties, such as abc.abstractproperty. Add 115 | # to this list to register other decorators that produce valid properties. 116 | property-classes=abc.abstractproperty 117 | 118 | # Regular expression matching correct variable names 119 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 120 | 121 | # Naming hint for variable names 122 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 123 | 124 | # Regular expression matching correct class attribute names 125 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 126 | 127 | # Naming hint for class attribute names 128 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 129 | 130 | # Regular expression matching correct argument names 131 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 132 | 133 | # Naming hint for argument names 134 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Regular expression matching correct module names 137 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 138 | 139 | # Naming hint for module names 140 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 141 | 142 | # Regular expression matching correct constant names 143 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 144 | 145 | # Naming hint for constant names 146 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 147 | 148 | # Regular expression matching correct inline iteration names 149 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 150 | 151 | # Naming hint for inline iteration names 152 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 153 | 154 | # Regular expression matching correct method names 155 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 156 | 157 | # Naming hint for method names 158 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 159 | 160 | # Regular expression matching correct function names 161 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 162 | 163 | # Naming hint for function names 164 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 165 | 166 | # Regular expression matching correct attribute names 167 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 168 | 169 | # Naming hint for attribute names 170 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 171 | 172 | # Regular expression matching correct class names 173 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 174 | 175 | # Naming hint for class names 176 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 177 | 178 | # Regular expression which should only match function or class names that do 179 | # not require a docstring. 180 | no-docstring-rgx=^test_ 181 | 182 | # Minimum line length for functions/classes that require docstrings, shorter 183 | # ones are exempt. 184 | docstring-min-length=-1 185 | 186 | 187 | [ELIF] 188 | 189 | # Maximum number of nested blocks for function / method body 190 | max-nested-blocks=5 191 | 192 | 193 | [FORMAT] 194 | 195 | # Maximum number of characters on a single line. 196 | max-line-length=80 197 | 198 | # Regexp for a line that is allowed to be longer than the limit. 
199 | ignore-long-lines=^\s*(# )??$ 200 | 201 | # Allow the body of an if to be on the same line as the test if there is no 202 | # else. 203 | single-line-if-stmt=y 204 | 205 | # List of optional constructs for which whitespace checking is disabled. `dict- 206 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 207 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 208 | # `empty-line` allows space-only lines. 209 | no-space-check=trailing-comma,dict-separator 210 | 211 | # Maximum number of lines in a module 212 | max-module-lines=1000 213 | 214 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 215 | # tab). 216 | indent-string=' ' 217 | 218 | # Number of spaces of indent required inside a hanging or continued line. 219 | indent-after-paren=4 220 | 221 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 222 | expected-line-ending-format= 223 | 224 | 225 | [LOGGING] 226 | 227 | # Logging modules to check that the string format arguments are in logging 228 | # function parameter format 229 | logging-modules=logging 230 | 231 | 232 | [MISCELLANEOUS] 233 | 234 | # List of note tags to take in consideration, separated by a comma. 235 | notes=FIXME,XXX,TODO 236 | 237 | 238 | [SIMILARITIES] 239 | 240 | # Minimum lines number of a similarity. 241 | min-similarity-lines=10 242 | 243 | # Ignore comments when computing similarities. 244 | ignore-comments=yes 245 | 246 | # Ignore docstrings when computing similarities. 247 | ignore-docstrings=yes 248 | 249 | # Ignore imports when computing similarities. 250 | ignore-imports=no 251 | 252 | 253 | [SPELLING] 254 | 255 | # Spelling dictionary name. Available dictionaries: none. To make it working 256 | # install python-enchant package. 257 | spelling-dict= 258 | 259 | # List of comma separated words that should not be checked. 260 | spelling-ignore-words= 261 | 262 | # A path to a file that contains private dictionary; one word per line. 263 | spelling-private-dict-file= 264 | 265 | # Tells whether to store unknown words to indicated private dictionary in 266 | # --spelling-private-dict-file option instead of raising a message. 267 | spelling-store-unknown-words=no 268 | 269 | 270 | [TYPECHECK] 271 | 272 | # Tells whether missing members accessed in mixin class should be ignored. A 273 | # mixin class is detected if its name ends with "mixin" (case insensitive). 274 | ignore-mixin-members=yes 275 | 276 | # List of module names for which member attributes should not be checked 277 | # (useful for modules/projects where namespaces are manipulated during runtime 278 | # and thus existing member attributes cannot be deduced by static analysis. It 279 | # supports qualified module names, as well as Unix pattern matching. 280 | ignored-modules= 281 | 282 | # List of class names for which member attributes should not be checked (useful 283 | # for classes with dynamically set attributes). This supports the use of 284 | # qualified names. 285 | ignored-classes=optparse.Values,thread._local,_thread._local,matplotlib.cm,tensorflow.python,tensorflow,tensorflow.train.Example,RunOptions 286 | 287 | # List of members which are set dynamically and missed by pylint inference 288 | # system, and so shouldn't trigger E1101 when accessed. Python regular 289 | # expressions are accepted. 290 | generated-members=set_shape,np.float32 291 | 292 | # List of decorators that produce context managers, such as 293 | # contextlib.contextmanager. 
Add to this list to register other decorators that 294 | # produce valid context managers. 295 | contextmanager-decorators=contextlib.contextmanager 296 | 297 | 298 | [VARIABLES] 299 | 300 | # Tells whether we should check for unused import in __init__ files. 301 | init-import=no 302 | 303 | # A regular expression matching the name of dummy variables (i.e. expectedly 304 | # not used). 305 | dummy-variables-rgx=(_+[a-zA-Z0-9_]*?$)|dummy 306 | 307 | # List of additional names supposed to be defined in builtins. Remember that 308 | # you should avoid to define new builtins when possible. 309 | additional-builtins= 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_,_cb 314 | 315 | # List of qualified module names which can have objects that can redefine 316 | # builtins. 317 | redefining-builtins-modules=six.moves,future.builtins 318 | 319 | 320 | [CLASSES] 321 | 322 | # List of method names used to declare (i.e. assign) instance attributes. 323 | defining-attr-methods=__init__,__new__,setUp 324 | 325 | # List of valid names for the first argument in a class method. 326 | valid-classmethod-first-arg=cls 327 | 328 | # List of valid names for the first argument in a metaclass class method. 329 | valid-metaclass-classmethod-first-arg=mcs 330 | 331 | # List of member names, which should be excluded from the protected access 332 | # warning. 333 | exclude-protected=_asdict,_fields,_replace,_source,_make 334 | 335 | 336 | [DESIGN] 337 | 338 | # Maximum number of arguments for function / method 339 | max-args=10 340 | 341 | # Argument names that match this expression will be ignored. Default to name 342 | # with leading underscore 343 | ignored-argument-names=_.* 344 | 345 | # Maximum number of locals for function / method body 346 | max-locals=30 347 | 348 | # Maximum number of return / yield for function / method body 349 | max-returns=6 350 | 351 | # Maximum number of branch for function / method body 352 | max-branches=12 353 | 354 | # Maximum number of statements in function / method body 355 | max-statements=100 356 | 357 | # Maximum number of parents for a class (see R0901). 358 | max-parents=7 359 | 360 | # Maximum number of attributes for a class (see R0902). 361 | max-attributes=10 362 | 363 | # Minimum number of public methods for a class (see R0903). 364 | min-public-methods=0 365 | 366 | # Maximum number of public methods for a class (see R0904). 367 | max-public-methods=20 368 | 369 | # Maximum number of boolean expressions in a if statement 370 | max-bool-expr=5 371 | 372 | 373 | [IMPORTS] 374 | 375 | # Deprecated modules which should not be used, separated by a comma 376 | deprecated-modules=optparse 377 | 378 | # Create a graph of every (i.e. internal and external) dependencies in the 379 | # given file (report RP0402 must not be disabled) 380 | import-graph= 381 | 382 | # Create a graph of external dependencies in the given file (report RP0402 must 383 | # not be disabled) 384 | ext-import-graph= 385 | 386 | # Create a graph of internal dependencies in the given file (report RP0402 must 387 | # not be disabled) 388 | int-import-graph= 389 | 390 | # Force import order to recognize a module as part of the standard 391 | # compatibility libraries. 392 | known-standard-library= 393 | 394 | # Force import order to recognize a module as part of a third party library. 395 | known-third-party=enchant 396 | 397 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 398 | # 3 compatible code, which means that the block might have code that exists 399 | # only in one or another interpreter, leading to false positives when analysed. 400 | analyse-fallback-blocks=no 401 | 402 | 403 | [EXCEPTIONS] 404 | 405 | # Exceptions that will emit a warning when being caught. Defaults to 406 | # "Exception" 407 | overgeneral-exceptions=Exception 408 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphRepo ![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square) [![BCH compliance](https://bettercodehub.com/edge/badge/NullConvergence/GraphRepo?branch=develop)](https://bettercodehub.com/) 2 | 3 | GraphRepo is a tool for mining software repositories in real time. 
It indexes Git repositories in Neo4j and implements multiple queries to select and process the repository data. 4 | 5 | For a complete description, see the [online documentation](https://graphrepo.readthedocs.io/en/latest/). 6 | 7 | 8 |

9 | [image: GraphRepo architecture and data structure diagrams (docs/source/GraphRepoArch.svg, docs/source/GraphRepoDS.svg)]
10 |
11 |
12 | ### 1. Installation & First run
13 |
14 | #### 1.1 Prerequisites
15 | The only requirement is to have Python >=3.5 and Docker installed on your system.
16 |
17 | #### 1.2 Install using pip
18 |
19 | The production release can be installed using pip:
20 |
21 | ```
22 | $ pip install graphrepo
23 | ```
24 |
36 |
37 |
38 | #### 1.3 Run and configure Neo4j
39 |
40 | The following instructions assume the Docker daemon is running on your machine:
41 |
42 | ```
43 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11
44 | ```
45 |
46 | Open a browser window and go to [http://localhost:7474](http://localhost:7474). Here you can configure the Neo4j password.
47 | The default one is *neo4jj*.
48 |
49 | ##### Optionally, configure Neo4j to allow a larger heap size by adding the following flags to the command above:
50 |
51 | ```
52 | --env NEO4J_dbms_memory_pagecache_size=4g
53 | --env NEO4J_dbms_memory_heap_max__size=4g
54 | ```
55 |
56 | #### 1.4. Index and visualize a repo
57 |
58 | In order to index a repository, you must clone it to your local machine and point GraphRepo to it. For example:
59 | ```
60 | $ mkdir repos
61 | $ cd repos
62 | $ git clone https://github.com/ishepard/pydriller
63 | ```
64 |
65 | Now enter the [examples](/examples) folder from this repository, and edit the configuration file for PyDriller to reflect the database URL and desired batch size:
66 | ```
67 | $ cd ../examples/
68 | $ nano configs/pydriller.yml
69 | ```
70 |
71 | Afterwards, run the script from the examples folder, which indexes the repository in Neo4j:
72 |
73 | ```
74 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
75 | ```
76 |
77 | Go to [http://localhost:7474](http://localhost:7474) and use the query from section 3.1 below.
78 |
79 |
80 | #### 1.5. Retrieve all data from Neo4j using GraphRepo
81 |
82 | Assuming you succeeded in step 1.4, use the following command to retrieve all indexed data:
83 |
84 | ```
85 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
86 | ```
87 |
88 |
89 | ### 2. Examples
90 |
91 | For a comprehensive introduction and more examples, see the [documentation](https://graphrepo.readthedocs.io/en/latest/examples.html).
92 |
93 |
94 |
95 | ### 3. Useful Neo4j queries for the web interface
96 |
97 | #### 3.1 Match all nodes in a graph
98 | ```
99 | MATCH (n) RETURN n
100 | ```
101 |
102 |
103 | #### 3.2 Delete all nodes and relationships in a graph
104 |
105 | ```
106 | MATCH (n) DETACH DELETE n;
107 | ```
108 |
109 | #### 3.3 Delete a limited number of commits and their relationships
110 |
111 | ```
112 | MATCH (n:Commit)
113 | // Take the first 100 commit nodes and their rels
114 | WITH n LIMIT 100
115 | DETACH DELETE n
116 | RETURN count(*);
117 | ```
118 |
119 |
120 |
121 | This project is enabled by [PyDriller](https://github.com/ishepard/pydriller).
122 | -------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/GraphRepoDS.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[diagram: GraphRepo data structure. Nodes: Developer, File, Method, Branch, Commit. Relationships: Author, Parent, BranchCommit, UpdateFile, UpdateMethod.]
-------------------------------------------------------------------------------- /docs/source/GraphRepoSchema.svg: --------------------------------------------------------------------------------
1 | [diagram: GraphRepo schema, rendered with Neo4j. Nodes: Branch, Commit, Developer, File, Method. Relationships: Parent, BranchCommit, UpdateFile, Author, UpdateMethod, Method.]
-------------------------------------------------------------------------------- /docs/source/_templates/breadcrumbs.html: --------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 |
3 | {% block breadcrumbs_aside %}
4 | {% endblock %} -------------------------------------------------------------------------------- /docs/source/architecture.rst: --------------------------------------------------------------------------------
1 | .. _architecture_toplevel:
2 |
3 | ==================
4 | Architecture
5 | ==================
6 |
7 | GraphRepo consists of 3 main components:
8 |
9 | * :ref:`DRILLERS` - components used to parse data from a Git repository and insert records in Neo4j,
10 | * :ref:`MINERS` and the MineManager - components which hold default queries and interfaces for retrieving data from Neo4j, and
11 | * :ref:`MAPPERS` - components used to transform the data retrieved by Miners into a specific format, and to filter or sort it.
12 |
13 | The advantage of using custom mappers is that the load on Neo4j can be decreased:
14 | lighter queries extract the raw data, while the more intensive processing happens in the
15 | custom mappers. For example, one can write a mapper using PySpark on raw data extracted
16 | from Neo4j and use the Apache Spark engine for scalability (a minimal mapper is sketched at the end of this page).
17 |
18 | .. image:: /GraphRepoArch.svg
19 | :width: 400
20 | :align: center
21 |
22 |
23 | Specific information about each component can be found using the links above.
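A minimal mapper sketch (an illustration only, not part of the GraphRepo API): a plain
Python function that turns rows returned by a miner into a sorted pandas DataFrame.
The ``rows`` argument is assumed to be a list of dictionaries whose keys follow the
``UpdateFile`` attributes from the :ref:`DS` page::

    import pandas as pd

    def complexity_over_time(rows):
        """Map raw miner rows to a DataFrame sorted by commit time."""
        df = pd.DataFrame(rows)                # one row per UpdateFile record
        df = df[df['complexity'].notna()]      # keep rows that carry complexity
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
        return df.sort_values('date')[['date', 'complexity']]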
-------------------------------------------------------------------------------- /docs/source/conf.py: --------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 | import os
18 | import sys
19 | sys.path.insert(0, os.path.abspath('../'))
20 |
21 |
22 | # -- Project information -----------------------------------------------------
23 |
24 | project = 'GraphRepo'
25 | copyright = '2021, GraphRepo'
26 | author = 'GraphRepo'
27 |
28 | # The full version, including alpha/beta/rc tags
29 | version = ''
30 | release = '1.0.0'
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 |
35 | master_doc = 'index'
36 |
37 | # Add any Sphinx extension module names here, as strings. They can be
38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
39 | # ones.
40 | extensions = ['sphinx.ext.autodoc',
41 | 'sphinx.ext.doctest']
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # List of patterns, relative to source directory, that match files and
47 | # directories to ignore when looking for source files.
48 | # This pattern also affects html_static_path and html_extra_path.
49 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
50 |
51 |
52 | # -- Options for HTML output -------------------------------------------------
53 |
54 | # The theme to use for HTML and HTML Help pages. See the documentation for
55 | # a list of builtin themes.
56 | #
57 | html_theme = 'sphinx_rtd_theme'
58 |
59 | # Add any paths that contain custom static files (such as style sheets) here,
60 | # relative to this directory. They are copied after the builtin static files,
61 | # so a file named "default.css" will overwrite the builtin "default.css".
62 | html_static_path = ['_static']
63 |
64 |
65 | html_css_files = [
66 | 'css/custom.css',
67 | ] -------------------------------------------------------------------------------- /docs/source/configuration.rst: --------------------------------------------------------------------------------
1 | .. _CONFIGURATION:
2 |
3 | ==================
4 | Configuration
5 | ==================
6 |
7 | For any activity, GraphRepo uses a YAML (.yml) configuration file with two objects:
8 |
9 | * a Neo4j instance configuration, and
10 | * a repository configuration,
11 |
12 | as follows::
13 |
14 | neo:
15 | db_url: localhost # the url for the Neo4j database
16 | port: 7687 # the Neo4j port
17 | db_user: neo4j # Neo4j authentication username
18 | db_pwd: neo4jj # Neo4j authentication password
19 | batch_size: 100 # the batch size for inserting the records in Neo4j - this setting depends on the Neo4j resources
20 |
21 | project:
22 | repo: "repos/graphrepo/" # the repository filepath
23 | start_date: "1 February, 2018" # the start date for indexing (leave empty to index from the project's first commit)
24 | end_date: "30 March, 2018" # the end date for indexing (leave empty to index up to the last commit)
25 | project_id: "graphrepo" # a unique project id for the database
26 | index_code: False # boolean, if True GraphRepo indexes for each file touched by a commit the source code before and after the commit. This parameter significantly increases the index time and the hardware resources needed for Neo4j. For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
27 | index_developer_email: True # boolean, if True, GraphRepo indexes the developer emails in the Developer node. Turn this flag off for GDPR or any other privacy concerns
28 |
29 |
30 |
31 | Neo4j configuration
32 | ====================
33 |
34 | GraphRepo connects to Neo4j using the Bolt API from `py2neo <https://py2neo.org/>`_.
35 | Currently, the only attributes needed to connect to Neo4j are the URL and port, plus the authentication credentials.
36 | All other configurations (e.g., setting the user permissions) are done on the database side.
37 |
38 |
39 | Repository configuration
40 | ========================
41 |
42 | In order to insert a repository in the database, it has to be cloned on the local machine (where GraphRepo will run).
43 | Afterwards, it can be linked with GraphRepo using the ``project.repo`` attribute in the config file.
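Before handing the file to GraphRepo, it can be sanity-checked with PyYAML. A
minimal sketch (the path below points at the sample config shipped in the
project's ``examples`` folder)::

    import yaml

    with open('examples/configs/pydriller.yml') as fh:
        cfg = yaml.safe_load(fh)

    # the two required top-level objects
    assert {'neo', 'project'} <= cfg.keys()
    assert 'repo' in cfg['project'], 'project.repo must point to a local clone'
    print(cfg['neo']['db_url'], cfg['neo']['port'])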
44 |
45 | If one does not want to use all the repository data (e.g., if the repository is very large), one can configure
46 | the index dates using the ``project.start_date`` and ``project.end_date`` attributes.
47 |
48 | The ``project.project_id`` attribute is used to give each project a unique identifier.
49 | Currently, GraphRepo indexes all repositories in the same database, so that information about teams of developers that work
50 | on distinct projects can be mined without merging databases.
51 |
52 |
53 | The ``project.index_code`` attribute decides if GraphRepo indexes, for each file touched by a commit, the source code before and after the commit.
54 | This parameter significantly increases the index time and the hardware resources needed for Neo4j.
55 | For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
56 |
57 |
58 | For examples of config files, see the project's repository, e.g., ``examples/configs/pydriller.yml``.
59 |
60 |
61 |
62 |
63 |
64 |
65 |
-------------------------------------------------------------------------------- /docs/source/css/custom.css: --------------------------------------------------------------------------------
1 | /* Hide "On GitHub" section from versions menu */
2 | div.rst-versions>div.rst-other-versions>div.injected>dl:nth-child(4) {
3 | display: none;
4 | }
5 |
6 | .wy-breadcrumbs-aside {
7 | display: none;
8 | } -------------------------------------------------------------------------------- /docs/source/data_structure.rst: --------------------------------------------------------------------------------
1 | .. _DS:
2 |
3 | ==================
4 | Schema
5 | ==================
6 |
7 | The resulting Neo4j schema consists of 5 node types and 6 relationship types, as illustrated below:
8 |
9 | .. figure:: /GraphRepoSchema.svg
10 | :width: 45 %
11 | :align: center
12 |
13 | Nodes
14 | ===========
15 |
16 |
17 | Branch
18 | -----------
19 |
20 | Each branch identified by PyDriller is indexed as a node with the following attributes::
21 |
22 | {
23 | "hash": "string - unique identifier",
24 | "project_id": "string - project id from config (can be used to select all branches from a project)",
25 | "name": "string - branch name",
26 | }
27 |
28 | Commit
29 | -----------
30 |
31 | Each commit is indexed as a node with the following attributes::
32 |
33 | {
34 | "hash": "string - unique identifier in Neo4j",
35 | "commit_hash": "string - commit hash in git",
36 | "message": "string - commit message in git",
37 | "is_merge": "int - 1 if the commit is a merge, 0 otherwise",
38 | "timestamp": "int - Unix epoch, time of the commit",
39 | "project_id": "string - project id from config (can be used to select all commits from a project)",
40 | "dmm_unit_complexity": "int, see PyDriller",
41 | "dmm_unit_interfacing": "int, see PyDriller",
42 | "dmm_unit_size": "int, see PyDriller"
43 | }
44 |
45 |
46 |
47 | Developer
48 | -----------
49 |
50 | Each developer is indexed as a node with the following attributes::
51 |
52 | {
53 | "hash": "string - unique identifier",
54 | "name": "string - developer name as in git",
55 | "email": "string - developer email as in git",
56 | }
57 |
58 | Currently, the name and email information is not anonymized; see the ``index_developer_email`` flag in :ref:`CONFIGURATION`.
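To spot-check what is stored for a developer, the node can be queried directly.
A sketch using `py2neo <https://py2neo.org/>`_ (which GraphRepo uses internally);
the connection details follow the Docker command from :ref:`INSTALLATION`::

    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jj"))
    devs = graph.run(
        "MATCH (d:Developer) RETURN d.name AS name, d.email AS email LIMIT 5"
    ).data()
    print(devs)  # a list of dicts, e.g. [{'name': ..., 'email': ...}]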
59 |
60 | File
61 | -----------
62 |
63 |
64 | Each file is indexed as a node with the following attributes::
65 |
66 | {
67 | "hash": "string - unique identifier",
68 | "name": "string - file short name as in git",
69 | "project_id": "string - project id from config (can be used to select all files from a project)",
70 | "type": "string - file extension, e.g., '.py'"
71 | }
72 |
73 |
74 |
75 | Method
76 | -----------
77 |
78 | Each method is indexed as a node with the following attributes::
79 |
80 | {
81 | "hash": "string - unique identifier",
82 | "name": "string - method name as in file",
83 | "file_name": "string - parent file name",
84 | "project_id": "string - project id from config (can be used to select all methods from a project)",
85 | "type": "string - file extension, e.g., '.py'"
86 | }
87 |
88 |
89 |
90 | Relationships
91 | ===============
92 |
93 | Author
94 | -----------
95 |
96 | An Author relationship exists between each commit and its author.
97 | The direction is from Commit to Author and the relationship attributes are::
98 |
99 | {
100 | "timestamp": "int - Unix epoch, time of the commit"
101 | }
102 |
103 |
104 | BranchCommit
105 | --------------
106 | A BranchCommit relationship exists between each branch and its commits.
107 | The direction is from Branch to Commit. This relationship does not have any special attributes.
108 |
109 |
110 | Method
111 | -----------
112 |
113 | A Method relationship exists between each file and its methods.
114 | The direction is from File to Method. This relationship does not have any special attributes.
115 | To find out whether a method is still part of the file or was deleted, use the FileMiner.
116 |
117 |
118 | Parent
119 | -----------
120 | A Parent relationship exists between each commit and its parent(s).
121 | This relationship does not have any special attributes.
122 |
123 |
124 | UpdateFile
125 | -----------
126 |
127 | An UpdateFile relationship exists between a commit that edited a file and the edited file.
128 | The direction is from Commit to File and the relationship attributes are::
129 |
130 | {
131 | "timestamp": "int - Unix epoch, time of the commit",
132 | "old_path": "string - old path, if the file was moved (see type attribute)",
133 | "path": "string - current file path",
134 | "diff": "string - commit diff",
135 | "source_code": "string - source code after the commit",
136 | "source_code_before": "string - source code before the commit",
137 | "nloc": "int - file lines of code after the commit",
138 | "complexity": "int - file complexity after the commit",
139 | "token_count": "int - number of tokens after the commit",
140 | "added": "int - number of lines added in commit",
141 | "removed": "int - number of lines removed in commit",
142 | "type": "string - type of update. Possible values are: 'ADD', 'COPY', 'RENAME', 'DELETE', 'MODIFY', 'UNKNOWN' "
143 | }
144 |
145 |
146 | UpdateMethod
147 | -------------
148 |
149 | An UpdateMethod relationship exists between a commit that edited a method and the edited method.
150 | The direction is from Commit to Method and the relationship attributes are::
151 |
152 | {
153 | "timestamp": "int - Unix epoch, time of the commit",
154 | "long_name": "string - method long name, including parameters",
155 | "parameters": "string - method parameters",
156 | "complexity": "int - method complexity, after commit",
157 | "nloc": "int - method lines of code, after commit",
158 | "fan_in": "int - method fan in, after commit",
159 | "fan_out": "int - method fan out, after commit",
160 | "general_fan_out": "int - method general fan out, after commit",
161 | "length": "int - method length, after commit",
162 | "token_count": "int - method number of tokens, after commit",
163 | "start_line": "int - method start line, after commit",
164 | "end_line": "int - method end line, after commit",
165 | }
166 |
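As an illustration of how this schema is queried, the complexity history of a
file can be read directly from its ``UpdateFile`` relationships. A sketch using
`py2neo <https://py2neo.org/>`_; the file name below is hypothetical::

    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jj"))
    history = graph.run(
        "MATCH (:Commit)-[u:UpdateFile]->(f:File {name: 'driller.py'}) "
        "RETURN u.timestamp AS ts, u.complexity AS complexity ORDER BY ts"
    ).data()  # one record per commit that touched the file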
-------------------------------------------------------------------------------- /docs/source/driller.rst: --------------------------------------------------------------------------------
1 | .. _DRILLERS:
2 |
3 | ==================
4 | Drillers
5 | ==================
6 |
7 | All Drillers parse a repository and insert it in Neo4j.
8 | Under the hood, all drillers use PyDriller to extract data from a repository.
9 |
10 | Drillers perform the following activities.
11 | Given a config file, they:
12 |
13 | * establish a connection to Neo4j (or raise an exception if the connection fails),
14 | * parse the data from PyDriller,
15 | * insert the data in Neo4j.
16 |
17 |
18 | Currently there are 3 drillers available:
19 |
20 | * Driller - the default driller, which keeps the data parsed from the repository in RAM.
21 | * CacheDriller - stores the data parsed from the repository on disk, saving RAM at the cost of more disk writes and decreased performance (a usage sketch is shown at the end of this page).
22 | * QueueDriller - sends the data parsed from a repository to a queue. Currently it supports RabbitMQ and Artemis. Note that, with a queue, two drillers must be used: (i) one that parses the data from Git repos and (ii) one that indexes the data in Neo4j.
23 | The queue driller is the most scalable one, since it allows multiple indexing instances and thus works around some scalability issues (e.g., PyDriller is single-threaded).
24 |
25 | In order to index the data, you will need a config file (see :ref:`CONFIGURATION`) and the
26 | following code::
27 |
28 | from graphrepo.drillers.driller import Driller
29 |
30 | # configure driller
31 | driller = Driller(config_path='path-to-yaml-config-file.yml')
32 |
33 | # initialize the database indexes (only needed once per database)
34 | try:
35 | driller.init_db()
36 | except Exception as exc:
37 | print("DB already initialized")
38 |
39 | # drill (extract data and store it in Neo4j)
40 | driller.drill_batch()
41 |
42 | # merge duplicate nodes
43 | driller.merge_all()
44 |
45 |
46 | For a complete example, see :ref:`EXAMPLES`.
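If memory is a concern, ``CacheDriller`` is meant as a drop-in replacement for
``Driller``. A sketch, assuming it mirrors the interface above (check
``graphrepo/drillers/cache_driller.py`` for the exact signatures)::

    from graphrepo.drillers.cache_driller import CacheDriller

    driller = CacheDriller(config_path='path-to-yaml-config-file.yml')
    driller.drill_batch()  # parsed data is cached on disk instead of RAM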
-------------------------------------------------------------------------------- /docs/source/examples.rst: --------------------------------------------------------------------------------
1 | .. _EXAMPLES:
2 |
3 | ==================
4 | Examples
5 | ==================
6 |
7 | In the project's repository there are many examples on how to
8 | use GraphRepo to index and mine data.
9 |
10 | Please note that in order to run the plotting examples you have to install ``pandas`` and ``plotly``, for example using pip::
11 |
12 | $ pip install pandas plotly
13 |
14 | 1. Index data
15 | ==============
16 |
17 | In this example, we index all data from PyDriller in Neo4j.
18 | The example assumes you are running a Neo4j instance in Docker, as indicated in :ref:`CONFIGURATION`.
19 |
20 | In order to run the example, clone the projects using the following commands::
21 |
22 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo
23 | $ cd graphrepo
24 | $ mkdir repos
25 | $ cd repos
26 | $ git clone https://github.com/ishepard/pydriller
27 |
28 | In this step we cloned the GraphRepo project, which includes the example scripts to run,
29 | and the PyDriller project, which we want to experiment with.
30 |
31 | In order to run the indexing example, make sure to configure the config file in ``examples/configs/pydriller.yml``
32 | and set the ``neo`` object to your database settings.
33 |
34 | Then run::
35 |
36 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
37 |
38 | After indexing finishes, you can go to ``http://localhost:7474/browser/``
39 | and explore the project, with a query like: ``MATCH (n) RETURN n``.
40 |
41 |
42 | 2. Retrieve all data
43 | =====================
44 |
45 | This step assumes you already indexed the PyDriller repository
46 | in Neo4j, as indicated in Step 1.
47 | In order to retrieve all information for PyDriller, we can run
48 | the following example::
49 |
50 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
51 |
52 | This script will print the number of nodes indexed in the database.
53 |
54 |
55 | 3. Plot file complexity over time
56 | ===================================
57 |
58 | This step assumes you already indexed the PyDriller repository
59 | in Neo4j, as indicated in Step 1.
60 | In this example we will use the miners to retrieve a file and
61 | plot its complexity evolution over time.
62 | The file used is ``examples/file_complexity.py``.
63 | The complexity is stored in the ``UpdateFile`` relationship (see Schema).
64 | The ``get_change_history`` method from the ``File`` miner retrieves all the ``UpdateFile``
65 | relationships that point to the file.
66 |
67 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
68 | although any other libraries can be used.
69 |
70 | In order to display the plot, run::
71 |
72 | $ python -m examples.file_complexity --config=examples/configs/pydriller.yml
73 |
74 |
75 |
76 |
77 | 4. Plot file methods complexity over time
78 | ==========================================
79 |
80 | This step assumes you already indexed the PyDriller repository
81 | in Neo4j, as indicated in Step 1.
82 | In this example we will use the miners to retrieve and plot the complexity
83 | evolution over time of all methods in a file.
84 | The file used is ``examples/all_method_complexity.py``.
85 | The complexity is stored in the ``UpdateMethod`` relationship (see Schema).
86 | We first get all the methods for a file, then, for each method, we get the
87 | update information as in Step 3.
88 |
89 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
90 | although any other libraries can be used.
91 |
92 | In order to display the plot, run::
93 |
94 | $ python -m examples.all_method_complexity --config=examples/configs/pydriller.yml --plot
95 |
96 |
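5. Count indexed nodes
======================

A compact variant of the mine-all example: the miners initialized by the
``MineManager`` (see :ref:`MINERS`) can be combined to print how many nodes of
each type were indexed. A sketch, assuming ``get_all`` returns one record per
node::

    from graphrepo.miners import MineManager

    miner = MineManager(config_path='examples/configs/pydriller.yml')
    for name, m in [('commits', miner.commit_miner),
                    ('developers', miner.dev_miner),
                    ('files', miner.file_miner),
                    ('methods', miner.method_miner)]:
        print(name, len(m.get_all()))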
GraphRepo documentation master file, created by 2 | sphinx-quickstart on Wed Jun 3 13:16:41 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | GraphRepo documentation 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | installation 13 | configuration 14 | architecture 15 | data_structure 16 | driller 17 | miners 18 | examples 19 | 20 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. _INSTALLATION: 2 | 3 | ======================== 4 | Overview & Installation 5 | ======================== 6 | 7 | GraphRepo is a tool that indexes Git repositories in Neo4j and allows you to query and aggregate the data. 8 | Under the hood it uses `PyDriller <https://github.com/ishepard/pydriller>`_ to parse the data from a repository. 9 | 10 | Requirements 11 | ============ 12 | 13 | * Python 3.4 (or newer) 14 | * Neo4j 3 15 | * Docker (optional) - we recommend running Neo4j in Docker (as indicated below) 16 | 17 | Installation - using pip 18 | ========================= 19 | 20 | Assuming Python and pip are installed, use: 21 | 22 | .. sourcecode:: none 23 | 24 | $ pip install graphrepo 25 | 26 | 27 | Installation - clone source code (dev version) 28 | =============================================== 29 | 30 | The latest development version can be cloned from GitHub:: 31 | 32 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo 33 | $ cd graphrepo 34 | 35 | 36 | Install the requirements: 37 | 38 | .. sourcecode:: none 39 | 40 | $ pip install -r requirements.txt 41 | 42 | Run a docker instance with Neo4j:: 43 | 44 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11 45 | 46 | Run the tests:: 47 | 48 | $ pytest 49 | 50 | 51 | Or see the :ref:`EXAMPLES`. -------------------------------------------------------------------------------- /docs/source/mappers.rst: -------------------------------------------------------------------------------- 1 | .. _MAPPERS: 2 | 3 | ================== 4 | Mappers 5 | ================== 6 | 7 | -------------------------------------------------------------------------------- /docs/source/miners.rst: -------------------------------------------------------------------------------- 1 | .. _MINERS: 2 | 3 | ================== 4 | Miners 5 | ================== 6 | 7 | Miners are special classes which hold default Neo4j queries that can be used to extract data. 8 | At the moment, there are 4 standard miners, specific to the most important node entities in the graph: 9 | 10 | * ``CommitMiner`` - default queries for commits (including relationships to other nodes), 11 | * ``DeveloperMiner`` - default queries for developers (including relationships to other nodes), 12 | * ``FileMiner`` - default queries for files (including relationships to other nodes), 13 | * ``MethodMiner`` - default queries for methods (including relationships to other nodes), 14 | 15 | and a ``MineManager``, which initializes and configures all miners. 16 | 17 | We recommend always using the ``MineManager`` for initialization, since it adds no overhead compared to initializing a single miner. 
18 | Using a config file (see :ref:`CONFIGURATION`), the ``MineManager`` can be initialized as follows:: 19 | 20 | from graphrepo.miners import MineManager 21 | 22 | # initialize mine manager 23 | miner = MineManager(config_path='path-to-yaml-config-file.yml') 24 | 25 | # The specific miners can now be accessed as: 26 | miner.commit_miner.get_all() 27 | 28 | miner.dev_miner.get_all() 29 | 30 | miner.file_miner.get_all() 31 | 32 | miner.method_miner.get_all() -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/examples/__init__.py -------------------------------------------------------------------------------- /examples/all_method_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module plots the method complexity evolution over time, for a file""" 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import pandas as pd 21 | import plotly.express as px 22 | 23 | from graphrepo.miners import MineManager 24 | 25 | from datetime import datetime 26 | 27 | 28 | def parse_args(): 29 | """Parse args""" 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | '--config', default='examples/configs/pydriller.yml', type=str) 33 | parser.add_argument('--plot', default=False, type=bool) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | """Main""" 39 | args = parse_args() 40 | 41 | file_query = { 42 | 'hash': 'e2eb7bf414cebe68f46fa88e4abe9ae5813e91c4e1e97570f8e41cf4'} 43 | 44 | start = datetime.now() 45 | mine_manager = MineManager(config_path=args.config) 46 | 47 | methods = mine_manager.file_miner.get_current_methods(file_query['hash']) 48 | 49 | m_changes = [] 50 | for m in methods: 51 | changes = mine_manager.method_miner.get_change_history(m['hash']) 52 | mc = [{'complexity': x['complexity'], 53 | 'date': datetime.fromtimestamp(x['timestamp']), 54 | 'name': m['name']} for x in changes] 55 | m_changes = m_changes + mc 56 | print('All methods complexity took: {}'.format(datetime.now() - start)) 57 | print('Total methods: ', len(methods)) 58 | 59 | if args.plot: 60 | df = pd.DataFrame(m_changes) 61 | df['date'] = pd.to_datetime(df.date) 62 | df = df.sort_values(by='date') 63 | fig = px.line(df, x="date", y="complexity", color="name", 64 | line_group="name", hover_name="name") 65 | fig.show() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /examples/benchmarks/all_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 
(the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import yaml 18 | from graphrepo.miners import MineManager 19 | from datetime import datetime 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | 31 | start = datetime.now() 32 | miner = MineManager(config_path=args.config) 33 | 34 | # get all nodes and relationships from the manager 35 | nodes, rels = miner.get_all_data(map=False, merge=False) 36 | print("The DB has a total of {} nodes and {} relationships".format( 37 | len(nodes), len(rels))) 38 | print("All data took: {}".format(datetime.now() - start)) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /examples/benchmarks/all_methods_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from graphrepo.miners import MineManager 25 | from graphrepo.utils import parse_config 26 | 27 | from datetime import datetime 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 33 | parser.add_argument('--plot', default=False, type=bool) 34 | return parser.parse_args() 35 | 36 | 37 | def main(): 38 | args = parse_args() 39 | 40 | if 'jax' in args.config: 41 | file_query = { 42 | 'hash': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae'} 43 | if 'hadoop' in args.config: 44 | file_query = { 45 | 'hash': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806'} 46 | if 'kibana' in args.config: 47 | file_query = { 48 | 'hash': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48' 49 | } 50 | if 'tensorflow' in args.config: 51 | file_query = { 52 | 'hash': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73' 53 | } 54 | 55 | start = datetime.now() 56 | mine_manager = MineManager(config_path=args.config) 57 | methods = mine_manager.file_miner.get_current_methods(file_query['hash']) 58 | 59 | m_changes = [] 60 | for m in methods: 61 | changes = mine_manager.method_miner.get_change_history(m['hash']) 62 | mc = [{'complexity': x['complexity'], 63 | 'date': datetime.fromtimestamp(x['timestamp']), 64 | 'name': m['name']} for x in changes] 65 | m_changes = m_changes + mc 66 | 67 | print('All methods complexity took: {}'.format(datetime.now() - start)) 68 | print('Total methods: ', len(methods)) 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /examples/benchmarks/dev_files.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | return parser.parse_args() 33 | 34 | 35 | def main(): 36 | args = parse_args() 37 | 38 | if 'jax' in args.config: 39 | dev_query = { 40 | 'hash': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381'} 41 | if 'hadoop' in args.config: 42 | dev_query = { 43 | 'hash': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685'} 44 | if 'kibana' in args.config: 45 | dev_query = { 46 | 'hash': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750'} 47 | 48 | if 'tensorflow' in args.config: 49 | dev_query = { 50 | 'hash': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e'} 51 | 52 | start = datetime.now() 53 | mine_manager = MineManager(config_path=args.config) 54 | files = mine_manager.dev_miner.get_files( 55 | dev_query['hash'], 56 | mine_manager.config.ct.project_id 57 | ) 58 | ft = [f['type'] for f in files] 59 | grouped = [{'file': x, 'count': len( 60 | [y for y in ft if x == y])} for x in set(ft)] 61 | 62 | print('Dev file types took {}'.format(datetime.now() - start)) 63 | print('Nr files', len(ft)) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /examples/benchmarks/dev_methods.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | 21 | from datetime import datetime 22 | from graphrepo.miners import MineManager 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 28 | return parser.parse_args() 29 | 30 | 31 | def main(): 32 | args = parse_args() 33 | 34 | if 'jax' in args.config: 35 | dev_query = { 36 | 'hash': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381'} 37 | if 'hadoop' in args.config: 38 | dev_query = { 39 | 'hash': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685'} 40 | if 'kibana' in args.config: 41 | dev_query = { 42 | 'hash': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750'} 43 | if 'tensorflow' in args.config: 44 | dev_query = { 45 | 'hash': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e'} 46 | 47 | start = datetime.now() 48 | mine_manager = MineManager(config_path=args.config) 49 | method_updates = mine_manager.dev_miner.get_method_updates( 50 | dev_query['hash'], 51 | mine_manager.config.ct.project_id 52 | ) 53 | complexity = [c['complexity'] 54 | for c in method_updates if c['complexity'] != -1] 55 | _ = sum(complexity) / len(complexity) 56 | 57 | print('Dev method updates took {}'.format(datetime.now() - start)) 58 | print('Nr method updates', len(method_updates)) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /examples/benchmarks/file_nloc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | parser.add_argument('--plot', default=False, type=bool) 33 | return parser.parse_args() 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | 39 | if 'jax' in args.config: 40 | file_query = { 41 | 'hash': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae'} 42 | if 'hadoop' in args.config: 43 | file_query = { 44 | 'hash': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806'} 45 | 46 | if 'kibana' in args.config: 47 | file_query = { 48 | 'hash': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48' 49 | } 50 | if 'tensorflow' in args.config: 51 | file_query = { 52 | 'hash': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73' 53 | } 54 | 55 | start = datetime.now() 56 | 57 | mine_manager = MineManager(config_path=args.config) 58 | updated_file_rels = mine_manager.file_miner.get_change_history( 59 | file_hash=file_query['hash']) 60 | nloc = [x['nloc'] for x in updated_file_rels] 61 | 62 | print('File nloc took {}'.format(datetime.now() - start)) 63 | print('File changes', len(updated_file_rels)) 64 | # print(updated_file_rels.data) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /examples/configs/graphrepo.yml: -------------------------------------------------------------------------------- 1 | 2 | neo: 3 | db_url: localhost 4 | port: 7687 5 | db_user: neo4j 6 | db_pwd: neo4jj 7 | batch_size: 50 8 | 9 | project: 10 | repo: repos/GraphRepo/ 11 | start_date: #"1 February, 2018" 12 | end_date: #"30 March, 2018" 13 | project_id: 'graphrepo' 14 | index_code: False 15 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/grepo-test.yml: -------------------------------------------------------------------------------- 1 | 2 | neo: 3 | db_url: localhost 4 | port: 7687 5 | db_user: neo4j 6 | db_pwd: neo4jj 7 | batch_size: 50 8 | 9 | project: 10 | repo: repos/gr-testbench/ 11 | start_date: #"1 February, 2018" 12 | end_date: #"30 March, 2018" 13 | project_id: 'graphrepo-testbench' 14 | index_code: False 15 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/hadoop.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/hadoop/ 10 | start_date: "1 January, 2017 00:00" 11 | end_date: "1 January, 2018 00:00" 12 | project_id: hadoop 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/jax.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/jax/ 10 | start_date: "1 January, 2019 00:00" 11 | end_date: "1 May, 2020 00:00" 12 | 
project_id: jax 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/kibana.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/kibana/ 10 | start_date: "1 June, 2018 00:00" 11 | end_date: "1 June, 2019 00:00" 12 | project_id: kibana 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/configs/pydriller.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/pydriller/ 10 | start_date: #"1 February, 2018" 11 | end_date: #"30 March, 2018" 12 | project_id: 'pydriller' 13 | index_code: False 14 | index_developer_email: True 15 | -------------------------------------------------------------------------------- /examples/configs/tensorflow.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 50 7 | 8 | project: 9 | repo: repos/tensorflow/ 10 | start_date: "1 January, 2020 00:00" 11 | end_date: "1 March, 2020 00:00" 12 | project_id: 'tensorflow' 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /examples/dev_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | '--config', default='examples/configs/pydriller.yml', type=str) 33 | return parser.parse_args() 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | mine_manager = MineManager(config_path=args.config) 39 | files = mine_manager.dev_miner.get_files( 40 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 41 | mine_manager.config.ct.project_id 42 | ) 43 | print(len(files), ' files') 44 | 45 | file_updates = mine_manager.dev_miner.get_files_updates( 46 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 47 | mine_manager.config.ct.project_id 48 | ) 49 | print(len(file_updates), ' file updates') 50 | 51 | methods = mine_manager.dev_miner.get_methods( 52 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 53 | mine_manager.config.ct.project_id 54 | ) 55 | print(len(methods), ' methods') 56 | 57 | method_updates = mine_manager.dev_miner.get_method_updates( 58 | "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0", 59 | mine_manager.config.ct.project_id 60 | ) 61 | print(len(method_updates), ' method updates') 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /examples/file_complexity.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
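# Illustrative invocation, as documented in docs/source/examples.rst
# (assumes the PyDriller repository was indexed first):
#   $ python -m examples.file_complexity --config=examples/configs/pydriller.yml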
14 | 15 | 16 | ### 17 | # This file assumes the project from the config file was already indexed 18 | ### 19 | import argparse 20 | import os 21 | import pandas as pd 22 | import plotly.express as px 23 | 24 | from datetime import datetime 25 | from graphrepo.miners import MineManager 26 | from graphrepo.utils import parse_config 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--config', default='configs/pydriller.yml', type=str) 32 | return parser.parse_args() 33 | 34 | 35 | def main(): 36 | args = parse_args() 37 | mine_manager = MineManager(config_path=args.config) 38 | 39 | file_miner = mine_manager.file_miner 40 | file_ = file_miner.query(project_id=mine_manager.config.ct.project_id, 41 | name="commit.py") 42 | updated_file_rels = file_miner.get_change_history(file_['hash']) 43 | 44 | # sort update relationships and transform data for plotting 45 | updated_file_rels.sort(key=lambda x: x['timestamp']) 46 | 47 | complexity = [x['complexity'] for x in updated_file_rels] 48 | nloc = [x['nloc'] for x in updated_file_rels] 49 | dts = [datetime.fromtimestamp(x['timestamp']) for x in updated_file_rels] 50 | 51 | fig = px.line(pd.DataFrame({'date': dts, 'complexity': complexity}), 52 | x='date', y='complexity', 53 | title='Complexity over time for the commit.py file') 54 | fig.show() 55 | 56 | fig_2 = px.line(pd.DataFrame({'date': dts, 'nloc': nloc}), 57 | x='date', y='nloc', title="NLOC over time for the commit.py file") 58 | fig_2.show() 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /examples/index_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """This module is an example of indexing all data from a repository in Neo4j""" 15 | 16 | import argparse 17 | from graphrepo.drillers import Driller 18 | 19 | 20 | def parse_args(): 21 | """Parse argument""" 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | '--config', default='examples/configs/pydriller.yml', type=str) 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | """Main method""" 30 | args = parse_args() 31 | driller = Driller(config_path=args.config) 32 | # this method should be called only once, when initializing 33 | # a database for the first time 34 | try: 35 | driller.init_db() 36 | except Exception as exc: 37 | print("DB already initialized") 38 | driller.drill_batch() 39 | driller.merge_all() 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /examples/mine_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import yaml 18 | from graphrepo.miners import MineManager 19 | from datetime import datetime 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | '--config', default='examples/configs/pydriller.yml', type=str) 26 | return parser.parse_args() 27 | 28 | 29 | def main(): 30 | args = parse_args() 31 | 32 | start = datetime.now() 33 | miner = MineManager(config_path=args.config) 34 | 35 | # get all nodes and relationships from the manager 36 | nodes, rels = miner.get_all_data() 37 | print("The DB has a total of {} nodes and {} relationships".format( 38 | len(nodes), len(rels))) 39 | print("All data took: {}".format(datetime.now() - start)) 40 | 41 | # get all commits 42 | commits = miner.commit_miner.get_all() 43 | print("The DB has a total of {} commits".format(len(commits))) 44 | 45 | # get all developers 46 | devs = miner.dev_miner.get_all() 47 | print("The DB has a total of {} developers".format(len(devs))) 48 | 49 | # get all files 50 | files = miner.file_miner.get_all() 51 | print("The DB has a total of {} files".format(len(files))) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /graphrepo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /graphrepo/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """This module stores all config constants. It is a singleton 16 | because it is used across several modules inside the app""" 17 | 18 | from graphrepo.singleton import Singleton 19 | from graphrepo.utils import Dotdict 20 | 21 | 22 | class Config(metaclass=Singleton): 23 | """This class contains all config flags""" 24 | ct = {} 25 | 26 | def configure(self, **kwargs): 27 | """Stores configuration constants, parsed 28 | from the yaml config file 29 | :param kwargs: keys and values from config 30 | """ 31 | self.ct = Dotdict(kwargs) 32 | 33 | def check_config(self): 34 | """Checks if the config properties are set and 35 | raises ValueError if any value is missing""" 36 | 37 | if not self.ct.db_url or not self.ct.port \ 38 | or not self.ct.db_user or not self.ct.db_pwd: 39 | raise ValueError("Neo4j configuration is invalid.") 40 | -------------------------------------------------------------------------------- /graphrepo/drillers/__init__.py: -------------------------------------------------------------------------------- 1 | from .driller import * 2 | from .cache_driller import * 3 | from .stomp_driller import * 4 | from .queue_driller import * 5 | -------------------------------------------------------------------------------- /graphrepo/drillers/batch_utils.py: -------------------------------------------------------------------------------- 1 | """This module is the wild wild west of batch indexing :-) 2 | It contains all Neo4j queries for indexing the data in batches. 3 | More documentation will follow soon. 
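The helpers below share the same pattern: the payload is split into
chunks of batch_size and each chunk is sent to Neo4j as a single
UNWIND + MERGE query. A minimal illustrative call (the connection
values are assumptions that mirror the example configs):

    from py2neo import Graph
    graph = Graph(host='localhost', user='neo4j', password='neo4jj', port=7687)
    index_commits(graph, [{'hash': 'abc123', 'project_id': 'demo'}], batch_size=50)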
4 | """ 5 | from datetime import datetime 6 | 7 | 8 | def batch(iterable, n=1): 9 | l = len(iterable) 10 | for ndx in range(0, l, n): 11 | yield iterable[ndx:min(ndx + n, l)] 12 | 13 | 14 | def index_commits(graph, commits, batch_size=100): 15 | query = """ 16 | UNWIND {commits} AS c 17 | MERGE (nc :Commit { hash: c.hash}) 18 | ON CREATE SET 19 | nc = c 20 | ON MATCH SET 21 | nc = c 22 | """ 23 | for b in batch(commits, batch_size): 24 | graph.run(query, commits=b) 25 | 26 | 27 | def index_parent_commits(graph, parents, batch_size=100): 28 | query = """ 29 | UNWIND {ac} AS a 30 | MATCH (x:Commit),(y:Commit) 31 | WHERE x.hash = a.parent_hash AND y.hash = a.child_hash 32 | MERGE (x)-[r:Parent{}]->(y) 33 | """ 34 | for b in batch(parents, batch_size): 35 | graph.run(query, ac=b) 36 | 37 | 38 | def index_authors(graph, authors, batch_size=100): 39 | query = """ 40 | UNWIND {authors} AS a 41 | MERGE (nd:Developer { hash: a.hash}) 42 | ON CREATE SET nd = a 43 | ON MATCH SET nd = a 44 | """ 45 | for b in batch(authors, batch_size): 46 | graph.run(query, authors=b) 47 | 48 | 49 | def index_branches(graph, branches, batch_size=100): 50 | query = """ 51 | UNWIND {branches} AS a 52 | MERGE (nb:Branch { hash: a.hash}) 53 | ON CREATE SET nb = a 54 | ON MATCH SET nb = a 55 | """ 56 | for b in batch(branches, batch_size): 57 | graph.run(query, branches=b) 58 | 59 | 60 | def index_branch_commits(graph, bc, batch_size=100): 61 | query = """ 62 | UNWIND {ac} AS a 63 | MATCH (x:Branch),(y:Commit) 64 | WHERE x.hash = a.branch_hash AND y.hash = a.commit_hash 65 | MERGE (x)-[r:BranchCommit{}]->(y) 66 | """ 67 | for b in batch(bc, batch_size): 68 | graph.run(query, ac=b) 69 | 70 | 71 | def index_files(graph, files, batch_size=100): 72 | query = """ 73 | UNWIND {files} AS f 74 | MERGE (nf:File { hash: f.hash}) 75 | ON CREATE SET nf = f 76 | ON MATCH SET nf = f 77 | """ 78 | for b in batch(files, batch_size): 79 | graph.run(query, files=b) 80 | 81 | 82 | def index_methods(graph, methods, batch_size=100): 83 | query = """ 84 | UNWIND {methods} AS f 85 | MERGE (nm:Method { hash: f.hash}) 86 | ON CREATE SET nm = f 87 | ON MATCH SET nm = f 88 | """ 89 | 90 | for b in batch(methods, batch_size): 91 | graph.run(query, methods=b) 92 | 93 | 94 | def index_author_commits(graph, ac, batch_size=100): 95 | query = """ 96 | UNWIND {ac} AS a 97 | MATCH (x:Developer),(y:Commit) 98 | WHERE x.hash = a.author_hash AND y.hash = a.commit_hash 99 | MERGE (x)-[r:Author{timestamp: a.timestamp}]->(y) 100 | """ 101 | for b in batch(ac, batch_size): 102 | graph.run(query, ac=b) 103 | 104 | 105 | def index_commit_files(graph, cf, batch_size=100): 106 | query = """ 107 | UNWIND {cf} AS a 108 | MATCH (x:Commit),(y:File) 109 | WHERE x.hash = a.commit_hash AND y.hash = a.file_hash 110 | MERGE (x)-[r:UpdateFile{}]->(y) 111 | ON CREATE SET r=a['attributes'] 112 | """ 113 | for i, b in enumerate(batch(cf, batch_size)): 114 | graph.run(query, cf=b) 115 | 116 | 117 | def index_file_methods(graph, cf, batch_size=100): 118 | query = """ 119 | UNWIND {cf} AS a 120 | MATCH (x:File),(y:Method) 121 | WHERE x.hash = a.file_hash AND y.hash = a.method_hash 122 | MERGE (x)-[r:Method{}]->(y) 123 | """ 124 | for b in batch(cf, batch_size): 125 | graph.run(query, cf=b) 126 | 127 | 128 | def index_commit_method(graph, cm, batch_size=100): 129 | query = """ 130 | UNWIND {cf} AS a 131 | MATCH (x:Commit),(y:Method) 132 | WHERE x.hash = a.commit_hash AND y.hash = a.method_hash 133 | MERGE (x)-[r:UpdateMethod]->(y) 134 | ON CREATE SET r=a['attributes'] 135 | """ 136 | 
for i, b in enumerate(batch(cm, batch_size)): 137 | graph.run(query, cf=b) 138 | 139 | 140 | def create_index_authors(graph): 141 | query = """ 142 | CREATE INDEX ON :Developer(hash) 143 | """ 144 | graph.run(query) 145 | 146 | 147 | def create_index_commits(graph, hash=True): 148 | if hash: 149 | hash_q = """ 150 | CREATE INDEX ON :Commit(hash) 151 | """ 152 | graph.run(hash_q) 153 | 154 | pid_q = """ 155 | CREATE INDEX ON :Commit(project_id) 156 | """ 157 | 158 | graph.run(pid_q) 159 | 160 | 161 | def create_index_branches(graph, hash=True): 162 | if hash: 163 | hash_q = """ 164 | CREATE INDEX ON :Branch(hash) 165 | """ 166 | graph.run(hash_q) 167 | 168 | pid_q = """ 169 | CREATE INDEX ON :Branch(project_id) 170 | """ 171 | graph.run(pid_q) 172 | 173 | 174 | def create_index_files(graph, hash=True): 175 | if hash: 176 | hash_q = """ 177 | CREATE INDEX ON :File(hash) 178 | """ 179 | graph.run(hash_q) 180 | 181 | mhash_q = """ 182 | CREATE INDEX ON :File(merge_hash) 183 | """ 184 | graph.run(mhash_q) 185 | 186 | pid_q = """ 187 | CREATE INDEX ON :File(project_id) 188 | """ 189 | graph.run(pid_q) 190 | 191 | 192 | def create_index_methods(graph, hash=True): 193 | if hash: 194 | hash_q = """ 195 | CREATE INDEX ON :Method(hash) 196 | """ 197 | graph.run(hash_q) 198 | 199 | mhash_q = """ 200 | CREATE INDEX ON :Method(merge_hash) 201 | """ 202 | graph.run(mhash_q) 203 | 204 | pid_q = """ 205 | CREATE INDEX ON :Method(project_id) 206 | """ 207 | graph.run(pid_q) 208 | 209 | 210 | def merge_renamed_files(graph, project_id): 211 | query = """ 212 | MATCH (n1:File),(n2:File) 213 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" and n1.merge_hash = n2.merge_hash and id(n1) < id(n2) 214 | WITH [n1,n2] as ns 215 | order by id(ns[1]) desc 216 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 217 | MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf 218 | with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows 219 | unwind allRows as row 220 | match (mu: Method {{hash: row.hash}}) 221 | SET mu.merge_hash = row.new_hash""".format(project_id) 222 | graph.run(query) 223 | 224 | def merge_new_files(graph, project_id): 225 | query = """ 226 | MATCH (n1:File),(n2:File) 227 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" and n1.merge_hash = n2.hash and id(n1) < id(n2) 228 | WITH [n1,n2] as ns 229 | order by id(ns[1]) desc 230 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 231 | MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf 232 | with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows 233 | unwind allRows as row 234 | match (mu: Method {{hash: row.hash}}) 235 | SET mu.merge_hash = row.new_hash 236 | """.format(project_id) 237 | graph.run(query) 238 | 239 | 240 | def merge_methods(graph, project_id): 241 | query = """ 242 | MATCH (n1:Method),(n2:Method) 243 | WHERE n1.project_id = "{0}" and n2.project_id = "{0}" 244 | and n1.file_name = n2.file_name and n1.name = n2.name and n1.project_id = n2.project_id and n1.merge_hash = n2.merge_hash and id(n1) < id(n2) 245 | WITH [n1,n2] as ns 246 | order by id(ns[1]) desc 247 | CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node 248 | return node 249 | """.format(project_id) 250 | graph.run(query) 251 | 252 | 253 | def merge_files(graph, config): 254 | print('Merging moved files and methods') 255 | start = datetime.now() 256 | merge_renamed_files(graph, config.project_id) 257 | 
merge_methods(graph, config.project_id) 258 | merge_new_files(graph, config.project_id) 259 | merge_methods(graph, config.project_id) 260 | print('Merged files and methods \t', datetime.now()-start) 261 | 262 | def index_all(graph, developers, commits, parents, dev_commits, branches, 263 | branches_commits, files, commit_files, methods, file_methods, 264 | commit_methods, config): 265 | 266 | total = datetime.now() 267 | 268 | batch_size = config.batch_size 269 | 270 | developers = list({v['hash']: v for v in developers}.values()) 271 | print('Indexing ', len(developers), ' authors') 272 | start = datetime.now() 273 | index_authors(graph, developers, batch_size) 274 | print('Indexed authors in: \t', datetime.now()-start) 275 | 276 | print('Indexing ', len(commits), ' commits') 277 | start = datetime.now() 278 | index_commits(graph, commits, batch_size) 279 | print('Indexed commits in: \t', datetime.now()-start) 280 | 281 | branches = list({v['hash']: v for v in branches}.values()) 282 | branches_commits = list({str(i): i for i in branches_commits}.values()) 283 | print('Indexing ', len(branches), ' branches') 284 | start = datetime.now() 285 | index_branches(graph, branches, batch_size) 286 | index_branch_commits(graph, branches_commits, batch_size) 287 | print('Indexed branches in: \t', datetime.now()-start) 288 | 289 | files = list({v['hash']: v for v in files}.values()) 290 | print('Indexing ', len(files), ' files') 291 | start = datetime.now() 292 | index_files(graph, files, batch_size) 293 | print('Indexed files in: \t', datetime.now()-start) 294 | 295 | methods = list({v['hash']: v for v in methods}.values()) 296 | print('Indexing ', len(methods), ' methods') 297 | start = datetime.now() 298 | index_methods(graph, methods, batch_size) 299 | print('Indexed methods in: \t', datetime.now()-start) 300 | 301 | parents = list({str(i): i for i in parents}.values()) 302 | print('Indexing ', len(parents), ' parent commits') 303 | start = datetime.now() 304 | index_parent_commits(graph, parents, batch_size) 305 | print('Indexed parent commits in: \t', datetime.now()-start) 306 | 307 | print('Indexing ', len(dev_commits), ' author_commits') 308 | start = datetime.now() 309 | index_author_commits(graph, dev_commits, batch_size) 310 | print('Indexed author_commits in: \t', datetime.now()-start) 311 | 312 | file_methods = list({str(i): i for i in file_methods}.values()) 313 | print('Indexing ', len(file_methods), ' file_methods') 314 | start = datetime.now() 315 | index_file_methods(graph, file_methods, batch_size) 316 | print('Indexed file_methods in: \t', datetime.now()-start) 317 | 318 | print('Indexing ', len(commit_methods), ' commit_methods') 319 | start = datetime.now() 320 | index_commit_method(graph, commit_methods, batch_size) 321 | print('Indexed commit_methods in: \t', datetime.now()-start) 322 | 323 | print('Indexing ', len(commit_files), ' commit_files') 324 | start = datetime.now() 325 | index_commit_files(graph, commit_files, batch_size) 326 | print('Indexed commit_files in: \t', datetime.now()-start) 327 | print('Indexing took: \t', datetime.now()-total) 328 | 329 | 330 | def index_cache(graph, cache, config): 331 | batch_size = config.batch_size 332 | total = datetime.now() 333 | index_authors(graph, list( 334 | {v['hash']: v for v in cache.data['developers']}.values()), batch_size) 335 | index_commits(graph, cache.data['commits'], batch_size) 336 | index_branches(graph, list( 337 | {v['hash']: v for v in cache.data['branches']}.values()), batch_size) 338 | index_branch_commits(graph, 
list( 339 | {str(i): i for i in cache.data['branches_commits']}.values()), batch_size) 340 | index_files(graph, list( 341 | {v['hash']: v for v in cache.data['files']}.values()), batch_size) 342 | index_methods(graph, list( 343 | {v['hash']: v for v in cache.data['methods']}.values()), batch_size) 344 | index_parent_commits(graph, list( 345 | {str(i): i for i in cache.data['parents']}.values()), batch_size) 346 | index_author_commits(graph, cache.data['dev_commits'], batch_size) 347 | index_file_methods(graph, list( 348 | {str(i): i for i in cache.data['file_methods']}.values()), batch_size) 349 | index_commit_method(graph, cache.data['commit_methods'], batch_size) 350 | index_commit_files(graph, cache.data['commit_files'], batch_size) 351 | print('Indexing took: \t', datetime.now()-total) 352 | -------------------------------------------------------------------------------- /graphrepo/drillers/cache_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ This module uses pydriller to search a repository 16 | and indexes it in neo4j 17 | """ 18 | from datetime import datetime 19 | from pydriller import RepositoryMining 20 | 21 | import graphrepo.utils as utl 22 | import graphrepo.drillers.batch_utils as b_utl 23 | from graphrepo.drillers.drill_cache import DrillCache, DrillCacheSequential 24 | from graphrepo.drillers.default import DefaultDriller 25 | from graphrepo.logger import Logger 26 | 27 | LG = Logger() 28 | 29 | 30 | class CacheDriller(DefaultDriller): 31 | """CacheDriller class - parses a git repo and uses the models 32 | to index everything in Neo4j by storing all data on disk. 
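    A minimal illustrative flow (the config path is an assumption):

        driller = CacheDriller(config_path='path-to-yaml-config-file.yml')
        cache = driller.drill_batch_cache_sequential(index=True)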
33 | """ 34 | 35 | def drill_batch_cache_sequential(self, index=True): 36 | """Extracts all information from a git repository 37 | and stores it in a disk cache 38 | :param index: optional, if True, the data is indexed in Neo4j 39 | :returns: cache with all data 40 | """ 41 | start = datetime.now() 42 | print('Driller started at: \t', start) 43 | cache = DrillCacheSequential() 44 | for commit in \ 45 | RepositoryMining(self.config.ct.repo, 46 | since=self.config.ct.start_date, 47 | to=self.config.ct.end_date).traverse_commits(): 48 | timestamp = commit.author_date.timestamp() 49 | dev = utl.format_dev(commit, self.config.ct.index_developer_email) 50 | cache.append_cache('developers', dev) 51 | com = utl.format_commit(commit, self.config.ct.project_id) 52 | cache.append_cache('commits', com) 53 | cache.append_cache( 54 | 'dev_commits', 55 | utl.format_author_commit(dev, com, timestamp)) 56 | for parent in commit.parents: 57 | cache.append_cache('parents', utl.format_parent_commit( 58 | com['hash'], parent, self.config.ct.project_id)) 59 | for branch in commit.branches: 60 | br_ = utl.format_branch(branch, self.config.ct.project_id) 61 | cache.append_cache('branches', br_) 62 | cache.append_cache('branches_commits', utl.format_branch_commit( 63 | br_['hash'], com['hash'])) 64 | for file in commit.modifications: 65 | fl_ = utl.format_file(file, self.config.ct.project_id) 66 | cache.append_cache('files', fl_) 67 | cache.append_cache('commit_files', utl.format_commit_file( 68 | com['hash'], file, timestamp, self.config.ct.project_id)) 69 | for method in file.changed_methods: 70 | met = utl.format_method( 71 | method, file, self.config.ct.project_id) 72 | cache.append_cache('methods', met) 73 | cache.append_cache( 74 | 'file_methods', 75 | utl.format_file_method(fl_['hash'], 76 | met['hash'])) 77 | cache.append_cache('commit_methods', 78 | utl.format_commit_method( 79 | com['hash'], 80 | met['hash'], 81 | method, 82 | timestamp)) 83 | print('Driller finished in: \t', datetime.now() - start) 84 | if index: 85 | self.index_batch(cache) 86 | return cache 87 | 88 | def index_batch(self, cache): 89 | """Indexes cached data to Neo4j 90 | :param cache: diskcache Cache or Index 91 | """ 92 | try: 93 | self.config.check_config() 94 | self._check_connection() 95 | b_utl.index_cache( 96 | self.graph, cache, config=self.config.ct) 97 | except Exception as exc: 98 | LG.log_and_raise(exc) 99 | else: 100 | return 101 | 102 | def drill_batch_cache_all(self, index=True): 103 | """Extracts the information from a repository in memory 104 | and caches it after the extraction 105 | :param index: optional, if True, the data is indexed in Neo4j 106 | """ 107 | data = self.drill_batch(index=False) 108 | cache = DrillCache(data) 109 | if index: 110 | self.index_batch(cache) 111 | return cache 112 | -------------------------------------------------------------------------------- /graphrepo/drillers/db_init.py: -------------------------------------------------------------------------------- 1 | """This module initializes the Neo4j indexes""" 2 | import graphrepo.drillers.batch_utils as utils 3 | 4 | 5 | def create_hash_constraints(graph): 6 | """Creates uniqueness constraints on nodes' hash""" 7 | query = """CREATE CONSTRAINT ON (n: {}) ASSERT n.hash IS UNIQUE""" 8 | nodes = ["Developer", "Branch", "Commit", "File", "Method"] 9 | for node in nodes: 10 | fquery = query.format(node) 11 | graph.run(fquery) 12 | 13 | 14 | def create_indices(graph, hash_index=True): 15 | """Initializes all indexes for the database""" 16 | if 
hash_index: 17 | utils.create_index_authors(graph) 18 | utils.create_index_branches(graph, hash_index) 19 | utils.create_index_commits(graph, hash_index) 20 | utils.create_index_files(graph, hash_index) 21 | utils.create_index_methods(graph, hash_index) 22 | -------------------------------------------------------------------------------- /graphrepo/drillers/default.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 GraphRepo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Default parent class for drillers 16 | """ 17 | from abc import abstractmethod 18 | from datetime import datetime 19 | from py2neo import Graph 20 | from pydriller import RepositoryMining 21 | 22 | import graphrepo.utils as utl 23 | import graphrepo.drillers.batch_utils as b_utl 24 | import graphrepo.drillers.db_init as db_init 25 | from graphrepo.config import Config 26 | from graphrepo.logger import Logger 27 | LG = Logger() 28 | 29 | 30 | class DefaultDriller(): 31 | """DefaultDriller class - parses a git repo and uses the models 32 | to index everything in Neo4j. 33 | """ 34 | 35 | def __init__(self, config_path): 36 | """Initializes the properties of this class 37 | :param config_path: path to yml config file 38 | """ 39 | try: 40 | if not config_path: 41 | raise FileNotFoundError 42 | neo, project = utl.parse_config(config_path) 43 | self.config = Config() 44 | self.graph = None 45 | self.config.configure(**neo, **project) 46 | self._connect() 47 | except Exception as exc: 48 | LG.log_and_raise(exc) 49 | 50 | def _connect(self): 51 | """Instantiates the connection to Neo4j and stores 52 | the graph internally. 53 | Raises an exception if the connection cannot be established 54 | """ 55 | try: 56 | self.graph = Graph(host=self.config.ct.db_url, 57 | user=self.config.ct.db_user, 58 | password=self.config.ct.db_pwd, 59 | port=self.config.ct.port) 60 | except Exception as exc: 61 | LG.log_and_raise(exc) 62 | 63 | def _check_connection(self): 64 | """Checks if there is a db connection and raises 65 | ReferenceError if not. 66 | """ 67 | try: 68 | self._connect() 69 | except: 70 | raise ReferenceError("There is no valid " 71 | "database connection. 
Please " 72 | "configure and connect first.") 73 | 74 | def init_db(self): 75 | """Runs the initialization of a database; creates 76 | constraints and indexes""" 77 | try: 78 | self._check_connection() 79 | db_init.create_hash_constraints(self.graph) 80 | db_init.create_indices(self.graph, hash_index=False) 81 | except Exception as exc: 82 | raise exc 83 | 84 | def clean(self): 85 | """Removes all data in a graph 86 | """ 87 | try: 88 | self.config.check_config() 89 | self._check_connection() 90 | 91 | self.graph.run("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r") 92 | except Exception as exc: 93 | LG.log_and_raise(exc) 94 | 95 | def drill_batch(self, index=True, save_path=None): 96 | """Extracts data from a software repository, with the option 97 | of saving it on disk and indexing it in Neo4j 98 | :param index: optional, if True, the data is indexed in Neo4j 99 | :param save_path: optional, if given, the data is stored on disk 100 | :returns: dictionary with all data 101 | """ 102 | start = datetime.now() 103 | print('Driller started at: \t', start) 104 | commits, parents, devs, dev_com, branches,\ 105 | branches_com, files, com_files, \ 106 | methods, files_methods, com_methods = \ 107 | [], [], [], [], [], [], [], [], [], [], [] 108 | for commit in \ 109 | RepositoryMining(self.config.ct.repo, 110 | since=self.config.ct.start_date, 111 | to=self.config.ct.end_date).traverse_commits(): 112 | self.drill_commit(commit, commits, parents, devs, dev_com, branches, 113 | branches_com, files, com_files, 114 | methods, files_methods, com_methods) 115 | 116 | data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches, 117 | branches_com, files, com_files, 118 | methods, files_methods, com_methods) 119 | 120 | print('Driller finished in: \t', datetime.now() - start) 121 | 122 | if save_path: 123 | utl.save_json(save_path, data_) 124 | if index: 125 | self.index_batch(**data_) 126 | return data_ 127 | 128 | def drill_commit(self, commit, commits, parents, devs, dev_com, branches, 129 | branches_com, files, com_files, 130 | methods, files_methods, com_methods): 131 | """Helper method - works with pass by reference""" 132 | timestamp = commit.author_date.timestamp() 133 | dev = utl.format_dev(commit, self.config.ct.index_developer_email) 134 | devs.append(dev) 135 | com = utl.format_commit(commit, self.config.ct.project_id) 136 | commits.append(com) 137 | dev_com.append(utl.format_author_commit(dev, com, timestamp)) 138 | for parent in commit.parents: 139 | parents.append(utl.format_parent_commit( 140 | com['hash'], parent, self.config.ct.project_id)) 141 | for branch in commit.branches: 142 | br_ = utl.format_branch(branch, self.config.ct.project_id) 143 | branches.append(br_) 144 | branches_com.append( 145 | utl.format_branch_commit(br_['hash'], com['hash'])) 146 | for file in commit.modifications: 147 | fl_ = utl.format_file(file, self.config.ct.project_id) 148 | files.append(fl_) 149 | com_files.append(utl.format_commit_file( 150 | com['hash'], file, 151 | timestamp, self.config.ct.project_id, self.config.ct.index_code)) 152 | for method in file.changed_methods: 153 | met = utl.format_method( 154 | method, file, self.config.ct.project_id) 155 | methods.append(met) 156 | files_methods.append( 157 | utl.format_file_method(fl_['hash'], met['hash']) 158 | ) 159 | com_methods.append( 160 | utl.format_commit_method(com['hash'], met['hash'], 161 | method, timestamp)) 162 | 163 | def data_dot_dict(self, commits, parents, devs, dev_com, branches, 164 | branches_com, files, com_files, 165 | 
methods, files_methods, com_methods): 166 | """Helper method""" 167 | return utl.Dotdict({'commits': commits, 168 | 'parents': parents, 169 | 'developers': devs, 170 | 'dev_commits': dev_com, 171 | 'branches': branches, 172 | 'branches_commits': branches_com, 173 | 'files': files, 174 | 'commit_files': com_files, 175 | 'methods': methods, 176 | 'file_methods': files_methods, 177 | 'commit_methods': com_methods}) 178 | 179 | @abstractmethod 180 | def index_batch(self): 181 | """Abstract index batch driller method 182 | """ 183 | raise NotImplementedError 184 | 185 | 186 | def merge_all(self): 187 | """Merges renamed files and methods""" 188 | try: 189 | b_utl.merge_files(self.graph, self.config.ct) 190 | except Exception as exc: 191 | LG.log_and_raise(exc) 192 | else: 193 | return 194 | -------------------------------------------------------------------------------- /graphrepo/drillers/delete_all.py: -------------------------------------------------------------------------------- 1 | # def delete_all(): 2 | # # get total #of nodes 3 | # res = session.run("MATCH(n) RETURN COUNT(*) AS n") 4 | # total_nodes = 0 5 | # for item in res: 6 | # total_nodes = item["n"] 7 | # print("\n Existing nodes in db:", total_nodes) 8 | 9 | # # get total #of relationships 10 | # res1 = session.run("MATCH (n)-[r]->() RETURN COUNT(r) as r") 11 | # total_rels = 0 12 | # for item in res1: 13 | # total_rels = item["r"] 14 | # print("\n Existing relationships in db:", total_rels) 15 | 16 | # # delete all nodes in batches (for faster deletion) 17 | # while total_nodes > 0: 18 | # res = session.run( 19 | # "MATCH(n) WITH n LIMIT 10000 DETACH DELETE n RETURN COUNT(n) AS count") 20 | # count = 0 21 | # for item in res: 22 | # count = item["count"] # updates deleted node count here 23 | # total_nodes = total_nodes-count 24 | # print("\n #of nodes in db after deletion completed = ", total_nodes) 25 | 26 | 27 | # start = time.time() 28 | # delete_all() 29 | # print("\n Pre cleanup time (sec): ", time.time()-start) 30 | 31 | # for prot in fileList: 32 | # print("\n\n", prot) 33 | # if os.path.exists(prot+"_AllCCs_maxDist11.csv"): 34 | # print("\n Already Processed.") 35 | # continue 36 | # start = time.time() 37 | # delete_all() 38 | # pre_time = time.time()-start 39 | # print("\n Pre cleanup time (sec): ", pre_time) 40 | 41 | # # Database preparation 42 | # session.run("CREATE INDEX ON :MyNode(Name)") 43 | 44 | # # 1. Create graph 45 | # start = time.time() 46 | # session.run("USING PERIODIC COMMIT " 47 | # "LOAD CSV FROM 'file:///'+{prot}+'_conflict_resolved.txt' AS line " 48 | # "MERGE (n:MyNode {Name:line[0]}) " 49 | # "MERGE (m:MyNode {Name:line[1]}) " 50 | # "MERGE (n) -[:TO {dist:line[2]}] -> (m) ", prot=prot) 51 | 52 | # end = time.time() 53 | # step1_time = end - start 54 | # print("\n Step 1 time (in sec) = ", end-start) 55 | 56 | # # 2 find CCs 57 | # start = time.time() 58 | # result = session.run("CALL algo.unionFind.stream('MyNode', 'TO', {graph:'huge'}) " 59 | # "YIELD nodeId,setId " 60 | # "MATCH (n) " 61 | # "WHERE id(n)=nodeId " 62 | # "WITH setId,collect(nodeId) as nodes, collect(n.Name) as labels,count(*) as size_of_component " 63 | # "ORDER BY size_of_component DESC " 64 | # "RETURN setId as componentId,size_of_component,labels as connectedTSRkeys ") 65 | # end = time.time() 66 | # step2_time = end - start 67 | # print("\n Step 2 time (in sec) = ", end-start) 68 | # # 3. 
save result 69 | # start = time.time() 70 | # # newline='' <- to avoid blank line between two rows 71 | # with open(prot+"_AllCCs_maxDist11.csv", "w") as csvfile: 72 | # writer = csv.writer(csvfile, delimiter=',') 73 | # writer.writerow( 74 | # ['componentId', 'size_of_component', 'connectedTSRkeys']) 75 | # for record in result: 76 | # record = str(record)[:-1].replace(", ", 77 | # ",").replace("'", "").split() 78 | # print("\n", record[1], record[2], record[3]) 79 | # writer.writerow([record[1].split("=")[1], record[2].split("=")[ 80 | # 1], record[3].split("=")[1]]) 81 | # end = time.time() 82 | # step3_time = end - start 83 | # print("\n Step 3 time (in sec) = ", end-start) 84 | 85 | # # 4. delete graph 86 | # start = time.time() 87 | # delete_all() 88 | # end = time.time() 89 | # post_time = end - start 90 | # print("\n Post cleanup time (in sec) = ", end-start) 91 | 92 | # print("\n Total time = ", pre_time+step1_time + 93 | # step2_time+step3_time+post_time) 94 | 95 | # driver.close() 96 | -------------------------------------------------------------------------------- /graphrepo/drillers/drill_cache.py: -------------------------------------------------------------------------------- 1 | """This module saves the cache data on disk""" 2 | import collections 3 | from diskcache import Index 4 | 5 | 6 | class DrillCache: 7 | """Class for storing all data at once in the cache""" 8 | 9 | def __init__(self, data): 10 | """Transforms a dictionary to an ordered dict and saves it""" 11 | dt_ = [(k, v) for k, v in data.items()] 12 | self.data = Index(collections.OrderedDict(dt_)) 13 | 14 | 15 | class DrillCacheSequential: 16 | """Class for a sequential disk cache, appended to record by record""" 17 | 18 | def __init__(self): 19 | """Init drill cache""" 20 | self.data = Index([('commits', []), 21 | ('parents', []), ('developers', []), 22 | ('dev_commits', []), ('branches', []), 23 | ('branches_commits', []), ('files', []), 24 | ('commit_files', []), ('methods', []), 25 | ('file_methods', []), ('commit_methods', []) 26 | ]) 27 | 28 | def append_cache(self, key, value): 29 | """Appends a record to an array in the disk cache 30 | :param key: data key 31 | :param value: value to append 32 | """ 33 | temp_ = self.data[key] 34 | temp_.append(value) 35 | self.data[key] = temp_ 36 | -------------------------------------------------------------------------------- /graphrepo/drillers/driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
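# NOTE (editor's illustration, not part of the repository): a minimal usage
# sketch for DrillCacheSequential above. It assumes diskcache is installed
# (see requirements.txt); the record dicts are placeholder shapes, similar to
# what graphrepo.utils produces.
from graphrepo.drillers.drill_cache import DrillCacheSequential

cache = DrillCacheSequential()
# each append reads the list from disk, appends the record and writes it back
cache.append_cache('commits', {'hash': 'abc123', 'message': 'example commit'})
cache.append_cache('developers', {'hash': 'dev1', 'name': 'Jane Doe'})
print(len(cache.data['commits']))  # -> 1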
14 | 15 | """ This module uses pydriller to search a repository 16 | and indexes it in Neo4j 17 | """ 18 | from diskcache import Cache 19 | from datetime import datetime 20 | from py2neo import Graph 21 | from pydriller import RepositoryMining 22 | 23 | import graphrepo.utils as utl 24 | import graphrepo.drillers.batch_utils as b_utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.drill_cache import DrillCacheSequential 27 | from graphrepo.drillers.default import DefaultDriller 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class Driller(DefaultDriller): 34 | """Drill class - parses a git repo and uses the models 35 | to index everything in Neo4j. This class is a singleton 36 | because it holds the connection to Neo4j in self.graph 37 | """ 38 | 39 | def index_batch(self, **kwargs): 40 | """Indexes data extracted by drill_batch or from 41 | disk in Neo4j 42 | :param kwargs: data keys and values (see the drill_batch return) 43 | """ 44 | try: 45 | self.config.check_config() 46 | self._check_connection() 47 | b_utl.index_all( 48 | self.graph, config=self.config.ct, **kwargs) 49 | except Exception as exc: 50 | LG.log_and_raise(exc) 51 | else: 52 | return 53 | 54 | def index_from_file(self, file_path): 55 | """Reads a file and indexes the data in Neo4j 56 | :param file_path: the path of the JSON file with data 57 | """ 58 | try: 59 | data_ = utl.load_json(file_path) 60 | self.index_batch(**data_) 61 | except Exception as exc: 62 | LG.log_and_raise(exc) 63 | else: 64 | return -------------------------------------------------------------------------------- /graphrepo/drillers/queue_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
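# NOTE (editor's illustration, not part of the repository): the typical
# Driller workflow defined above - mine once, persist to JSON, index later.
# It assumes a running Neo4j instance and a config file such as
# tests/cnfg_simple.yml or examples/configs/graphrepo.yml; the save path is
# a placeholder.
from graphrepo.drillers.driller import Driller

driller = Driller('tests/cnfg_simple.yml')
data = driller.drill_batch(index=False, save_path='data/repo.json')
driller.index_from_file('data/repo.json')  # index the saved batch later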
14 | """Default Parent class for drillers 15 | """ 16 | from abc import abstractmethod 17 | from datetime import datetime 18 | from py2neo import Graph 19 | from pydriller import RepositoryMining 20 | 21 | import graphrepo.utils as utl 22 | from graphrepo.config import Config 23 | from graphrepo.drillers.driller import Driller 24 | import graphrepo.drillers.batch_utils as b_utl 25 | from graphrepo.logger import Logger 26 | 27 | LG = Logger() 28 | 29 | 30 | class QueueDriller(Driller): 31 | """QueueDriller class - parses a git repo and publishes 32 | the data in a queue every n commits 33 | """ 34 | 35 | def __init__(self, neo, project, queue): 36 | """Initializes the properties of this class 37 | :param neo: 38 | :param project: 39 | :param queue: 40 | """ 41 | # TODO: validate inputs 42 | try: 43 | self.project, self.queue = project, queue 44 | self.config = Config() 45 | self.graph = None 46 | self.config.configure(**neo, **self.project) 47 | # self._connect() 48 | except Exception as exc: 49 | LG.log_and_raise(exc) 50 | 51 | @abstractmethod 52 | def connect_queue(self): 53 | """Establishes a connection to queue""" 54 | raise NotImplementedError 55 | 56 | @abstractmethod 57 | def send_index_data(self, data): 58 | """Indexes data""" 59 | raise NotImplementedError 60 | 61 | def drill_batch(self, index=True, save_path=None): 62 | """Extracts data from a software repository, with the option 63 | of saving it on diks and indexing it in Neo4j 64 | :param index: optional, if True, the data is indexed in Neo4j 65 | :param save_path: optional, if given, the data is stored on dik 66 | :returns: dictionary with all data 67 | """ 68 | start = datetime.now() 69 | print('Driller started at: \t', start) 70 | commits, parents, devs, dev_com, branches,\ 71 | branches_com, files, com_files, \ 72 | methods, files_methods, com_methods = \ 73 | [], [], [], [], [], [], [], [], [], [], [] 74 | commit_index = 0 75 | for commit in \ 76 | RepositoryMining(self.config.ct.repo, 77 | since=self.config.ct.start_date, 78 | to=self.config.ct.end_date).traverse_commits(): 79 | 80 | self.drill_commit(commit, commits, parents, devs, dev_com, branches, 81 | branches_com, files, com_files, 82 | methods, files_methods, com_methods) 83 | 84 | if commit_index == self.queue['commit_batch'] - 1: 85 | data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches, 86 | branches_com, files, com_files, 87 | methods, files_methods, com_methods) 88 | 89 | self.send_index_data( 90 | {'project_conf': self.project, 'data': data_}) 91 | 92 | commits, parents, devs, dev_com, branches, branches_com, files, com_files, methods, files_methods, com_methods = [ 93 | ], [], [], [], [], [], [], [], [], [], [] 94 | commit_index = 0 95 | else: 96 | commit_index += 1 97 | 98 | print('Driller finished in: \t', datetime.now() - start) 99 | -------------------------------------------------------------------------------- /graphrepo/drillers/rabbit_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """RabbitMQ driller - publishes the mined data to a RabbitMQ queue 15 | """ 16 | import json 17 | import pika 18 | 19 | from abc import abstractmethod 20 | from datetime import datetime 21 | from py2neo import Graph 22 | from pydriller import RepositoryMining 23 | 24 | import graphrepo.utils as utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.queue_driller import QueueDriller 27 | import graphrepo.drillers.batch_utils as b_utl 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class RabbitDriller(QueueDriller): 34 | """RabbitDriller class - parses a git repo and publishes 35 | the data to a RabbitMQ queue every n commits 36 | """ 37 | 38 | def connect_queue(self): 39 | """Establishes a connection to the queue""" 40 | try: 41 | credentials = pika.PlainCredentials( 42 | self.queue['username'], self.queue['password']) 43 | self.con_parameters = pika.ConnectionParameters(self.queue['host'], 44 | self.queue['port'], 45 | self.queue['vhost'], 46 | credentials) 47 | connection = pika.BlockingConnection( 48 | self.con_parameters) 49 | channel = connection.channel() 50 | 51 | channel.queue_declare(queue=self.queue['queue'], durable=True) 52 | return connection, channel 53 | except Exception as e: 54 | raise e 55 | 56 | def send_index_data(self, data): 57 | """Publishes the data to the RabbitMQ queue for indexing""" 58 | try: 59 | connection, channel = self.connect_queue() 60 | channel.basic_publish( 61 | exchange='', 62 | routing_key=self.queue['queue'], 63 | body=json.dumps(data), 64 | properties=pika.BasicProperties( 65 | delivery_mode=2, # make message persistent 66 | )) 67 | connection.close() 68 | except Exception as e: 69 | raise e 70 | -------------------------------------------------------------------------------- /graphrepo/drillers/stomp_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
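# NOTE (editor's illustration, not part of the repository): a consumer sketch
# for the messages published by RabbitDriller above, using the pika 1.x API
# pinned in requirements.txt. Host, credentials and queue name are
# placeholders; each message body is the JSON dict built in send_index_data.
import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='graphrepo', durable=True)


def on_message(ch, method, properties, body):
    msg = json.loads(body)
    # e.g. hand msg['data'] to a Driller's index_batch for indexing
    print(msg['project_conf'], len(msg['data']['commits']))
    ch.basic_ack(delivery_tag=method.delivery_tag)  # ack after processing


channel.basic_consume(queue='graphrepo', on_message_callback=on_message)
channel.start_consuming()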
14 | """Default Parent class for drillers 15 | """ 16 | import stomp 17 | import json 18 | 19 | from abc import abstractmethod 20 | from datetime import datetime 21 | from py2neo import Graph 22 | from pydriller import RepositoryMining 23 | 24 | import graphrepo.utils as utl 25 | from graphrepo.config import Config 26 | from graphrepo.drillers.queue_driller import QueueDriller 27 | import graphrepo.drillers.batch_utils as b_utl 28 | from graphrepo.logger import Logger 29 | 30 | LG = Logger() 31 | 32 | 33 | class StompDriller(QueueDriller): 34 | """StompDriller class - parses a git repo and publishes 35 | the data in a queue every n commits 36 | """ 37 | 38 | def connect_queue(self): 39 | """Establishes a connection to queue""" 40 | try: 41 | conn = stomp.Connection( 42 | [(self.queue['host'], self.queue['port']) 43 | ], vhost=self.queue['vhost'], heartbeats=(10000, 10000) 44 | ) 45 | 46 | conn.connect(self.queue['username'], 47 | self.queue['password'], wait=True) 48 | return conn 49 | except Exception as e: 50 | raise e 51 | 52 | def send_index_data(self, data): 53 | """Indexes data""" 54 | try: 55 | conn = self.connect_queue() 56 | conn.send(body=json.dumps(data), destination=self.queue.queue) 57 | conn.disconnect() 58 | except Exception as e: 59 | raise e 60 | -------------------------------------------------------------------------------- /graphrepo/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Logger""" 15 | from graphrepo.singleton import Singleton 16 | 17 | 18 | class Logger(metaclass=Singleton): 19 | def __init__(self, *args, **kwargs): 20 | """Default init""" 21 | 22 | def log(self, exception): 23 | """Logs exceptions and prints it to console 24 | :param exception: Exception type from Python 25 | """ 26 | print('[EXCEPTION]: {}'.format(exception)) 27 | 28 | def log_and_raise(self, exception): 29 | """Logs, prints and raises exception 30 | :param exception: Python Exception object 31 | """ 32 | self.log(exception) 33 | raise exception 34 | -------------------------------------------------------------------------------- /graphrepo/mappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv import CSVMapper 2 | from .default import DefaultMapper 3 | -------------------------------------------------------------------------------- /graphrepo/mappers/csv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom mapper class with some abstractions""" 15 | from abc import abstractmethod 16 | import pandas as pd 17 | 18 | from graphrepo.mappers.default import DefaultMapper 19 | 20 | 21 | class CSVMapper(DefaultMapper): 22 | """Mapper which converts query results (e.g. lists 23 | of dictionaries or py2neo records) to pandas DataFrames""" 24 | 25 | def map(self, objects): 26 | """The default CSV map function; 27 | assumes the objects are an iterable that pandas 28 | can convert to a DataFrame (e.g. a list of dicts) 29 | """ 30 | return pd.DataFrame(objects) 31 | -------------------------------------------------------------------------------- /graphrepo/mappers/default.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom mapper class with some abstractions""" 15 | from abc import abstractmethod 16 | 17 | 18 | class DefaultMapper(): 19 | """The mappers are currently synchronous, but 20 | ideally they will be async in the future""" 21 | 22 | def __init__(self, *args, **kwargs): 23 | pass 24 | -------------------------------------------------------------------------------- /graphrepo/miners/__init__.py: -------------------------------------------------------------------------------- 1 | from .commit import CommitMiner 2 | from .default import DefaultMiner 3 | from .developer import DeveloperMiner 4 | from .file import FileMiner 5 | from .mine_manager import MineManager 6 | from .method import MethodMiner 7 | -------------------------------------------------------------------------------- /graphrepo/miners/commit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
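# NOTE (editor's illustration, not part of the repository): CSVMapper above
# simply hands the query results to pandas, so any iterable accepted by
# pd.DataFrame works. The records are placeholders.
from graphrepo.mappers import CSVMapper

mapper = CSVMapper()
commits = [{'hash': 'a1', 'message': 'first commit'},
           {'hash': 'b2', 'message': 'second commit'}]
frame = mapper.map(commits)          # -> 2x2 pandas DataFrame
frame.to_csv('commits.csv', index=False)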
14 | """This module mines commits and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | from graphrepo.miners.utils import format_commit_id_date 18 | 19 | 20 | class CommitMiner(DefaultMiner): 21 | """This class holds queries for commits""" 22 | 23 | def query(self, **kwargs): 24 | """Queries commits by any arguments given in kwargs 25 | For example kwargs can be {'hash': 'example-hash'} 26 | :param kwargs: any parameter and value, between hash, name or email 27 | :returns: list of commit nodes matched 28 | """ 29 | com_ = self.node_matcher.match("Commit", **kwargs) 30 | return [dict(x) for x in com_] 31 | 32 | def get_between_dates(self, start_date, end_date, 33 | project_id=None): 34 | """Returns all commits between start and end date 35 | :param start_date: timestamp, start date 36 | :param end_date: timestamp, end date 37 | :param project_id: optional; if given only the commits from a project 38 | are returned 39 | :returns: list of commitss 40 | """ 41 | com_filter, where = format_commit_id_date( 42 | project_id, start_date, end_date) 43 | query = """ 44 | MATCH (c: Commit {0}) 45 | {1} 46 | RETURN distinct c 47 | """.format(com_filter, where) 48 | dt_ = self.graph.run(query) 49 | return [dict(x['c']) for x in dt_.data()] 50 | 51 | def get_all(self,): 52 | """Returns all commits 53 | :returns: list of commit nodes 54 | """ 55 | com_ = self.node_matcher.match("Commit") 56 | return [dict(x) for x in com_] 57 | 58 | def get_commit_files(self, commit_hash): 59 | """Returns the files updated in a commit 60 | :param commit_hash: optional; if given, it will 61 | return the data only for one commit 62 | :returns: list of commit files 63 | """ 64 | query = """ 65 | MATCH (c:Commit {{hash: "{0}"}}) 66 | -[UpdateFile]->(f:File) 67 | return distinct f 68 | """.format(commit_hash) 69 | files_ = self.graph.run(query) 70 | return [x['f'] for x in files_.data()] 71 | 72 | def get_commit_file_updates(self, commit_hash): 73 | """Returns the updates a commit made to files (UpdateFile rel) 74 | :param commit_hash: optional; if given, it will 75 | return the data only for one commit 76 | :returns: list of 77 | """ 78 | query = """ 79 | MATCH (c:Commit {{hash: "{0}"}}) 80 | -[f: UpdateFile]->(fu:File) 81 | return distinct f 82 | """.format(commit_hash) 83 | files_ = self.graph.run(query) 84 | return [x['f'] for x in files_.data()] 85 | 86 | def get_commit_methods(self, commit_hash=None): 87 | """Returns the methods updated in a commit 88 | :param commit_hash: optional; if given, it will 89 | return the data only for one commit 90 | """ 91 | query = """ 92 | MATCH (c:Commit {{hash: "{0}"}}) 93 | -[UpdateMethod]->(m:Method) 94 | return distinct m 95 | """.format(commit_hash) 96 | files_ = self.graph.run(query) 97 | return [x['m'] for x in files_.data()] 98 | 99 | def get_commit_method_updates(self, commit_hash=None): 100 | """Returns the updatemethod relationships from a commit 101 | :param commit_hash: optional; if given, 102 | it will return the data only for one commit 103 | :param dic: optional, boolean for ocnverting the data to dictionaries 104 | """ 105 | query = """ 106 | MATCH (c:Commit {{hash: "{0}"}}) 107 | -[m:UpdateMethod]->(mu:Method) 108 | return distinct m 109 | """.format(commit_hash) 110 | files_ = self.graph.run(query) 111 | return [x['m'] for x in files_.data()] 112 | -------------------------------------------------------------------------------- /graphrepo/miners/default.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module is a custom miner class with some abstractions""" 15 | from abc import abstractmethod 16 | 17 | 18 | class DefaultMiner(): 19 | """The miners are currently synchronous, but 20 | ideally they will be async in the future""" 21 | 22 | def __init__(self, graph, node_matcher, rel_matcher, *args, **kwargs): 23 | self.graph = graph 24 | self.node_matcher = node_matcher 25 | self.rel_matcher = rel_matcher 26 | 27 | @abstractmethod 28 | def get_all(self): 29 | """This method returns all artifacts 30 | found by a miner""" 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /graphrepo/miners/developer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines developers and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | from graphrepo.miners.utils import format_commit_id_date as fcid 18 | 19 | 20 | class DeveloperMiner(DefaultMiner): 21 | """This class holds queries for the Developer nodes""" 22 | 23 | def query(self, **kwargs): 24 | """Queries developers by any arguments given in kwargs 25 | For example kwargs can be {'hash': 'example-hash'} or 26 | {'email': 'example-email'} 27 | :param kwargs: any parameter and value among hash, name or email 28 | :returns: list of nodes matched 29 | """ 30 | return self.node_matcher.match("Developer", **kwargs) 31 | 32 | def get_commits(self, dev_hash, project_id=None, 33 | start_date=None, end_date=None): 34 | """Returns all commits authored by a developer.
35 | Optionally, it also filters by project id 36 | :param dev_hash: developer unique identifier 37 | :param project_id: optional; if present the 38 | query returns the commits from a project 39 | :param start_date: optional timestamp; filter commits 40 | beginning with this date 41 | :param end_date: optional timestamp; filter commits 42 | until this date 43 | :returns: list of commits 44 | """ 45 | com_filter, where = fcid(project_id, 46 | start_date, end_date) 47 | cquery = """ 48 | MATCH (d:Developer {{hash: "{0}"}}) 49 | -[r:Author]-> 50 | (c:Commit {1}) 51 | {2} 52 | RETURN distinct c; 53 | """.format(dev_hash, com_filter, where) 54 | dt_ = self.graph.run(cquery) 55 | return [dict(x['c']) for x in dt_.data()] 56 | 57 | def get_files(self, dev_hash, project_id=None, 58 | start_date=None, end_date=None): 59 | """Returns all files edited by a developer. 60 | Optionally it also filters by project_id 61 | :param dev_hash: developer unique identifier 62 | :param project_id: optional; if present the query 63 | returns the files from a specific project 64 | :param start_date: optional timestamp; filter files 65 | beginning with this date 66 | :param end_date: optional timestamp; filter files 67 | until this date 68 | :returns: list of files 69 | """ 70 | com_filter, where = fcid(project_id, 71 | start_date, end_date) 72 | fquery = """ 73 | MATCH (d:Developer {{hash: "{0}"}}) 74 | -[r:Author]-> 75 | (c:Commit {1}) 76 | -[:UpdateFile]-> 77 | (f: File) 78 | {2} 79 | RETURN collect(distinct f); 80 | """.format(dev_hash, com_filter, where) 81 | dt_ = self.graph.run(fquery) 82 | return [dict(x) for x in dt_.data()[0]['collect(distinct f)']] 83 | 84 | def get_files_updates(self, dev_hash, project_id=None, 85 | start_date=None, end_date=None): 86 | """Returns all file update information (e.g. file complexity), 87 | for all files edited by a developer. 88 | Optionally it also filters by project_id 89 | :param dev_hash: developer unique identifier 90 | :param project_id: optional; if present the query 91 | returns the files from a specific project 92 | :param start_date: optional timestamp; filter files 93 | beginning with this date 94 | :param end_date: optional timestamp; filter files 95 | until this date 96 | :returns: list of file updates 97 | """ 98 | com_filter, where = fcid(project_id, 99 | start_date, end_date) 100 | fuquery = """ 101 | MATCH (d:Developer {{hash: "{0}"}}) 102 | -[r:Author]-> 103 | (c:Commit {1}) 104 | -[fu: UpdateFile]-> 105 | (f: File) 106 | {2} 107 | RETURN distinct fu; 108 | """.format(dev_hash, com_filter, where) 109 | 110 | dt_ = self.graph.run(fuquery) 111 | return [dict(x['fu']) for x in dt_.data()] 112 | 113 | def get_methods(self, dev_hash, project_id=None, 114 | start_date=None, end_date=None): 115 | """Returns all methods updated by a developer.
116 | Optionally it also filters by project_id 117 | :param dev_hash: developer unique identifier 118 | :param project_id: optional; if present the query 119 | returns the methods from a specific project 120 | :param start_date: optional timestamp; filter methods 121 | beginning with this date 122 | :param end_date: optional timestamp; filter methods 123 | until this date 124 | :returns: list of methods 125 | """ 126 | com_filter, where = fcid(project_id, 127 | start_date, end_date) 128 | mquery = """ 129 | MATCH (d:Developer {{hash: "{0}"}}) 130 | -[r:Author]-> 131 | (c:Commit {1}) 132 | -[um: UpdateMethod]-> 133 | (m: Method) 134 | {2} 135 | RETURN distinct m; 136 | """.format(dev_hash, com_filter, where) 137 | 138 | dt_ = self.graph.run(mquery) 139 | return [dict(x['m']) for x in dt_.data()] 140 | 141 | def get_method_updates(self, dev_hash, project_id=None, 142 | start_date=None, end_date=None): 143 | """Returns all method update information, for all 144 | methods updated by a developer. 145 | Optionally it also filters by project_id 146 | :param dev_hash: developer unique identifier 147 | :param project_id: optional; if present the query 148 | returns the method updates from a specific project 149 | :param start_date: optional timestamp; filter method updates 150 | beginning with this date 151 | :param end_date: optional timestamp; filter method updates 152 | until this date 153 | :returns: list of method updates 154 | """ 155 | com_filter, where = fcid(project_id, 156 | start_date, end_date) 157 | muquery = """ 158 | MATCH (d:Developer {{hash: "{0}"}}) 159 | -[r:Author]-> 160 | (c:Commit {1}) 161 | -[um: UpdateMethod]-> 162 | () 163 | {2} 164 | RETURN distinct um; 165 | """.format(dev_hash, com_filter, where) 166 | 167 | dt_ = self.graph.run(muquery) 168 | return [dict(x['um']) for x in dt_.data()] 169 | 170 | def get_all(self): 171 | return self.node_matcher.match("Developer") 172 | -------------------------------------------------------------------------------- /graphrepo/miners/file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines files and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | 18 | 19 | class FileMiner(DefaultMiner): 20 | """This class holds queries for the File nodes""" 21 | 22 | def query(self, **kwargs): 23 | """Searches for a file using the arguments in kwargs.
24 | If no kwargs are given it returns the first file found 25 | """ 26 | return self.node_matcher.match("File", **kwargs).first() 27 | 28 | def get_all(self): 29 | """Returns all nodes of type File 30 | :return: list of files 31 | """ 32 | return self.node_matcher.match("File") 33 | 34 | def get_change_history(self, file_hash): 35 | """Returns all UpdateFile relationships for a file 36 | :param file_hash: a string, unique identifier for file 37 | :return: list of UpdateFile relationships, converted 38 | to dictionaries; the raw py2neo records (which 39 | can be used in mappers) are available by 40 | running the query directly 41 | """ 42 | query = """MATCH ()-[r:UpdateFile]->(f:File {{hash: "{0}"}}) 43 | return distinct r 44 | """.format(file_hash) 45 | dt_ = self.graph.run(query) 46 | return [dict(x['r']) for x in dt_.data()] 47 | 48 | def get_current_methods(self, file_hash): 49 | """Returns all current methods of a file 50 | :param file_hash: a string, unique identifier for file 51 | :return: list of methods, converted to dictionaries; 52 | the raw py2neo records (which can be used 53 | in mappers) are available by running the 54 | query directly 55 | """ 56 | query = """MATCH (f:File {{hash: "{0}"}})-[r:Method]->(m:Method) 57 | return distinct m 58 | """.format(file_hash) 59 | dt_ = self.graph.run(query) 60 | return [dict(x['m']) for x in dt_.data()] 61 | 62 | def get_past_methods(self, file): 63 | """Returns methods that were removed from the file 64 | :param file: Py2Neo File object 65 | :return: list of Method objects 66 | """ 67 | # NOTE: currently not implemented - returns None 68 | # return [rel.end_node 69 | # for rel in self.graph.match([file, None], "HadMethod")] 70 | -------------------------------------------------------------------------------- /graphrepo/miners/method.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module mines methods and contains all related Neo4j queries""" 15 | 16 | from graphrepo.miners.default import DefaultMiner 17 | 18 | 19 | class MethodMiner(DefaultMiner): 20 | def __init__(self, graph, node_matcher, rel_matcher, *args, **kwargs): 21 | super().__init__(graph, node_matcher, rel_matcher, *args, **kwargs) 22 | 23 | def query(self, **kwargs): 24 | """Searches for a method using the arguments in kwargs.
25 | If no kwargs are given it returns the first method found 26 | """ 27 | return self.node_matcher.match("Method", **kwargs).first() 28 | 29 | def get_all(self): 30 | """Returns all nodes of type Method 31 | :return: list of methods 32 | """ 33 | return self.node_matcher.match("Method") 34 | 35 | def get_change_history(self, method_hash): 36 | """Returns all UpdateMethod relationships for a method 37 | :param method_hash: method unique identifier 38 | :return: list of UpdateMethod relationships, converted 39 | to dictionaries; the raw py2neo records (which 40 | can be used in mappers) are available by running 41 | the query directly 42 | """ 43 | query = """MATCH ()-[r:UpdateMethod]->(m: Method{{hash: "{0}"}}) 44 | RETURN distinct r 45 | """.format(method_hash) 46 | dt_ = self.graph.run(query) 47 | return [dict(x['r']) for x in dt_.data()] 48 | -------------------------------------------------------------------------------- /graphrepo/miners/mine_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """This module initializes and configures all miners""" 15 | from py2neo import Graph, NodeMatcher, RelationshipMatcher 16 | import graphrepo.utils as utl 17 | from graphrepo.config import Config 18 | from graphrepo.logger import Logger 19 | from graphrepo.singleton import Singleton 20 | from graphrepo import miners 21 | 22 | 23 | LG = Logger() 24 | 25 | 26 | class MineManager(metaclass=Singleton): 27 | """MineManager class - This class manages custom 28 | miners. At the moment we instantiate all miners, 29 | but other managers which handle different 'teams of miners' 30 | can be created. 31 | """ 32 | 33 | def __init__(self, config_path): 34 | """Initializes the properties of this class""" 35 | self.commit_miner, self.dev_miner, \ 36 | self.file_miner, self.method_miner = None, None, None, None 37 | try: 38 | if not config_path: 39 | raise FileNotFoundError 40 | neo, project = utl.parse_config(config_path) 41 | self.config = Config() 42 | self.config.configure(**neo, **project) 43 | self.graph = None 44 | self.node_matcher = None 45 | self.rel_matcher = None 46 | self.connect() 47 | except Exception as exc: 48 | LG.log_and_raise(exc) 49 | 50 | def connect(self): 51 | """Instantiates the connection to Neo4j and stores 52 | the graph internally. 53 | Raises an exception if the connection cannot be established 54 | """ 55 | try: 56 | self.graph = Graph(host=self.config.ct.db_url, 57 | user=self.config.ct.db_user, 58 | password=self.config.ct.db_pwd, 59 | http_port=self.config.ct.port) 60 | self.node_matcher = NodeMatcher(self.graph) 61 | self.rel_matcher = RelationshipMatcher(self.graph) 62 | self.init_miners() 63 | except Exception as exc: 64 | LG.log_and_raise(exc) 65 | 66 | def check_connection(self): 67 | """Checks if there is a db connection and raises 68 | ReferenceError if not.
69 | """ 70 | try: 71 | self.connect() 72 | except: 73 | raise ReferenceError("There is no valid " 74 | "database connection. Please " 75 | "configure and connect first.") 76 | 77 | def init_miners(self): 78 | """Initializes all miners""" 79 | try: 80 | # TODO: Parse this automatically? 81 | self.commit_miner = miners.CommitMiner( 82 | graph=self.graph, 83 | node_matcher=self.node_matcher, 84 | rel_matcher=self.rel_matcher) 85 | self.dev_miner = \ 86 | miners.DeveloperMiner(graph=self.graph, 87 | node_matcher=self.node_matcher, 88 | rel_matcher=self.rel_matcher) 89 | self.file_miner = \ 90 | miners.FileMiner(graph=self.graph, 91 | node_matcher=self.node_matcher, 92 | rel_matcher=self.rel_matcher) 93 | self.method_miner = \ 94 | miners.MethodMiner(graph=self.graph, 95 | node_matcher=self.node_matcher, 96 | rel_matcher=self.rel_matcher) 97 | 98 | except Exception as exc: 99 | LG.log_and_raise(exc) 100 | else: 101 | return 102 | 103 | def get_all_data(self): 104 | """Returns all nodes and relationships from Neo4j 105 | :returns: a tuple with two arrays: the first with nodes, 106 | the second with relationships 107 | """ 108 | nodes = self.node_matcher.match() 109 | rels = self.rel_matcher.match() 110 | 111 | return list(nodes), list(rels) 112 | -------------------------------------------------------------------------------- /graphrepo/miners/utils.py: -------------------------------------------------------------------------------- 1 | """Utils methods for miners""" 2 | 3 | 4 | def format_commit_id_date(project_id, start_date, end_date, commit_hash=None): 5 | """Formats commit query with id and dates 6 | :param project_id: the project unique identifier 7 | :param start_date: timestamp, commit start_date 8 | :param end_date: timestamp, commit end_date 9 | :param ccommit_hash: optional, if given the query 10 | filters by commit hash 11 | :returns: query filter string and where clause 12 | """ 13 | com_filter, where = "", "" 14 | if project_id and not commit_hash: 15 | com_filter += """{{project_id: "{0}"}}""".format(project_id) 16 | if project_id and commit_hash: 17 | com_filter += """{{project_id: "{0}", hash: {1}}}""".format( 18 | project_id, commit_hash) 19 | if start_date: 20 | where += "c.timestamp >= {0}".format(start_date) 21 | if end_date: 22 | where += " AND " if where else "" 23 | where += "c.timestamp <= {0}".format(end_date) 24 | where = "WHERE " + where if where else where 25 | 26 | return com_filter, where 27 | -------------------------------------------------------------------------------- /graphrepo/singleton.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Singleton metaclass""" 16 | 17 | 18 | class Singleton(type): 19 | """ 20 | Define an Instance operation that lets clients access its unique 21 | instance. 
22 | """ 23 | 24 | def __init__(cls, name, bases, attrs, *args, **kwargs): 25 | super().__init__(name, bases, attrs) 26 | cls._instance = None 27 | 28 | def __call__(cls, *args, **kwargs): 29 | if cls._instance is None: 30 | cls._instance = super().__call__(*args, **kwargs) 31 | return cls._instance 32 | -------------------------------------------------------------------------------- /graphrepo/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 GraphRepo 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Utils methods for GraphRepo""" 15 | import json 16 | import hashlib 17 | from datetime import datetime 18 | import yaml 19 | 20 | 21 | class Dotdict(dict): 22 | """dot.notation access to dictionary attributes""" 23 | __getattr__ = dict.get 24 | __setattr__ = dict.__setitem__ 25 | __delattr__ = dict.__delitem__ 26 | 27 | 28 | def parse_config(path): 29 | with open(path, 'r') as ymlfile: 30 | conf = yaml.load(ymlfile, Loader=yaml.FullLoader) 31 | 32 | neo = conf['neo'] 33 | project = conf['project'] 34 | 35 | project['start_date'] = datetime.strptime( 36 | project['start_date'], '%d %B, %Y %H:%M') \ 37 | if project['start_date'] else None 38 | project['end_date'] = datetime.strptime( 39 | project['end_date'], '%d %B, %Y %H:%M') \ 40 | if project['end_date'] else None 41 | 42 | return neo, project 43 | 44 | 45 | def save_json(path, data): 46 | with open(path, 'w') as outfile: 47 | json.dump(data, outfile) 48 | 49 | 50 | def load_json(path): 51 | with open(path) as json_file: 52 | return json.load(json_file) 53 | 54 | 55 | def get_file_hash(file, project_id=None, use_new_path=False): 56 | name = '' 57 | if not file.old_path and file.new_path: 58 | # ADD File 59 | name = name + file.new_path 60 | elif file.old_path and not file.new_path: 61 | # DELETE 62 | name = name+file.old_path 63 | elif file.old_path and file.new_path: 64 | # MODIFY OR RENAME 65 | if use_new_path: 66 | name = name + file.new_path 67 | else: 68 | name = name + file.old_path 69 | 70 | name = name+file.filename 71 | name = project_id + name if project_id else name 72 | return hashlib.sha224(str(name).encode('utf-8')).hexdigest() 73 | 74 | 75 | def get_method_type(method, m_before, m_current): 76 | if method.name in m_before and method.name not in m_current: 77 | return "DELETE" 78 | elif method.name in m_before and method.name in m_current: 79 | return "MODIFY" 80 | else: 81 | return "ADD" 82 | 83 | 84 | def get_method_hash(method, file, project_id=None): 85 | fhash = get_file_hash(file, project_id) 86 | _fmname = fhash + "_" + method.name 87 | _fmname = project_id + _fmname if project_id else _fmname 88 | return hashlib.sha224(_fmname.encode('utf-8')).hexdigest() 89 | 90 | 91 | def get_author_hash(email): 92 | return hashlib.sha224(email.encode('utf-8')).hexdigest() 93 | 94 | 95 | def format_dev(dev, index_email=True): 96 | return { 97 | 'name': dev.author.name, 98 | 'email': dev.author.email if index_email else '', 99 | 
'hash': get_author_hash(dev.author.email) 100 | } 101 | 102 | 103 | def get_commit_hash(chash, project_id): 104 | return hashlib.sha224(str(project_id + chash).encode('utf-8')).hexdigest() 105 | 106 | 107 | def format_commit(com, project_id): 108 | return { 109 | 'hash': get_commit_hash(com.hash, project_id), 110 | 'commit_hash': com.hash, 111 | 'message': com.msg, 112 | 'is_merge': 1 if com.merge else 0, 113 | 'timestamp': com.author_date.timestamp(), 114 | 'project_id': project_id, 115 | 'dmm_unit_complexity': com.dmm_unit_complexity if com.dmm_unit_complexity else -1, 116 | 'dmm_unit_interfacing': com.dmm_unit_interfacing if com.dmm_unit_interfacing else -1, 117 | 'dmm_unit_size': com.dmm_unit_size if com.dmm_unit_size else -1, 118 | } 119 | 120 | 121 | def format_parent_commit(c_hash, parent_hash, project_id=None): 122 | return { 123 | 'child_hash': c_hash, 124 | 'parent_hash': get_commit_hash(parent_hash, project_id) 125 | } 126 | 127 | 128 | def format_branch(name, project_id): 129 | return { 130 | 'hash': hashlib.sha224(str(project_id+name).encode('utf-8')).hexdigest(), 131 | 'project_id': project_id, 132 | 'name': name 133 | } 134 | 135 | 136 | def format_author_commit(dev, com, timestamp): 137 | return {'commit_hash': com['hash'], 138 | 'author_hash': dev['hash'], 139 | 'timestamp': timestamp, 140 | } 141 | 142 | 143 | def format_branch_commit(bhash, chash): 144 | return {'branch_hash': bhash, 145 | 'commit_hash': chash 146 | } 147 | 148 | 149 | def format_file(file, project_id): 150 | return { 151 | 'hash': get_file_hash(file, project_id), 152 | 'merge_hash': get_file_hash(file, project_id, use_new_path=True), 153 | 'name': file.filename, 154 | 'project_id': project_id, 155 | 'type': '.' + file.filename.split('.')[-1] 156 | } 157 | 158 | 159 | def format_commit_file(c_hash, file, timestamp, project_id, index_code=True): 160 | f_hash = get_file_hash(file, project_id) 161 | f_merge_hash = get_file_hash(file, project_id, use_new_path=True) 162 | dt_ = {'commit_hash': c_hash, 'file_hash': f_hash, 163 | 'attributes': { 164 | 'timestamp': timestamp, 165 | 'old_path': file.old_path if file.old_path else '', 166 | 'path': file.new_path if file.new_path else '', 167 | 'source_code': '', 168 | 'source_code_before': '', 169 | 'diff': file.diff, 170 | 'nloc': file.nloc if file.nloc else -1, 171 | 'complexity': file.complexity if file.complexity else -1, 172 | 'token_count': file.token_count if file.token_count else -1, 173 | 'added': file.added, 174 | 'removed': file.removed, 175 | 'type': file.change_type.name, 176 | 'f_hash': f_hash, 177 | 'm_hash': f_merge_hash}} 178 | 179 | if index_code: 180 | dt_['attributes']['source_code'] = str( 181 | file.source_code) if file.source_code else '' 182 | dt_['attributes']['source_code_before'] = str( 183 | file.source_code_before) if file.source_code_before else '' 184 | 185 | return dt_ 186 | 187 | 188 | def format_commit_method(c_hash, m_hash, met, timestamp): 189 | return { 190 | 'commit_hash': c_hash, 191 | 'method_hash': m_hash, 192 | 'attributes': { 193 | 'timestamp': timestamp, 194 | 'long_name': met.long_name, 195 | 'parameters': met.parameters, 196 | 'complexity': met.complexity, 197 | 'nloc': met.nloc, 198 | 'fan_in': met.fan_in, 199 | 'fan_out': met.fan_out, 200 | 'general_fan_out': met.general_fan_out, 201 | 'length': met.length, 202 | 'token_count': met.token_count, 203 | 'start_line': met.start_line, 204 | 'end_line': met.end_line}} 205 | 206 | 207 | def format_method(met, file, project_id): 208 | return { 209 | 'hash':
get_method_hash(met, file, project_id), 210 | 'name': met.name, 211 | 'file_name': met.filename, 212 | 'project_id': project_id} 213 | 214 | 215 | def format_file_method(f_hash, m_hash): 216 | return {'file_hash': f_hash, 'method_hash': m_hash} 217 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lizard==1.16.6 2 | pytz==2018.9 3 | psutil==5.7.0 4 | py2neo==4.3.0 5 | pydriller==1.15.1 6 | requests==2.21.0 7 | pytest==5.3.5 8 | GitPython==3.1.0 9 | PyYAML==5.3.1 10 | diskcache==4.1.0 11 | pika==1.1.0 12 | stomp.py==6.1.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # v0.3.5 released 2 | from setuptools import setup, find_packages 3 | 4 | with open('requirements.txt') as reqs_file: 5 | requirements = reqs_file.read().splitlines() 6 | 7 | setup(name="graphrepo", 8 | version="0.3.5", 9 | description="A tool that maps a GitHub repo to Neo4j and helps mine the repo in the DB", 10 | url="https://github.com/NullConvergence/GraphRepo", 11 | license='Apache License', 12 | python_requires='>=3.5', 13 | install_requires=requirements, 14 | packages=find_packages('.'), 15 | package_dir={'graphrepo': 'graphrepo'}) 16 | 17 | # python3 setup.py sdist bdist_wheel 18 | # python3 -m twine upload dist/* 19 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/__init__.py -------------------------------------------------------------------------------- /tests/cnfg_init.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 200 7 | 8 | project: 9 | repo: tests/gr-test 10 | start_date: "14 May, 2020 00:00" 11 | end_date: "15 May, 2020 23:00" 12 | project_id: 'graph_repo_test' 13 | index_code: False 14 | index_developer_email: True -------------------------------------------------------------------------------- /tests/cnfg_simple.yml: -------------------------------------------------------------------------------- 1 | neo: 2 | db_url: localhost 3 | port: 7687 4 | db_user: neo4j 5 | db_pwd: neo4jj 6 | batch_size: 200 7 | 8 | project: 9 | repo: tests/gr-test 10 | start_date: "14 May, 2020 00:00" 11 | end_date: "15 May, 2020 02:00" 12 | project_id: 'graph_repo_test' 13 | index_code: True 14 | index_developer_email: True -------------------------------------------------------------------------------- /tests/test_cache_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
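# NOTE (editor's illustration, not part of the repository): how the YAML
# configs above map onto graphrepo.utils.parse_config - the 'neo' and
# 'project' sections come back as dicts, with the date strings parsed to
# datetime objects.
from graphrepo.utils import parse_config

neo, project = parse_config('tests/cnfg_simple.yml')
print(neo['db_url'], neo['port'])  # localhost 7687
print(project['start_date'])       # 2020-05-14 00:00:00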
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.cache_driller import CacheDriller 18 | 19 | 20 | class TestCacheDriller: 21 | def test_indexing(self): 22 | folder = os.path.dirname(os.path.abspath(__file__)) 23 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml')) 24 | test_driller.drill_batch_cache_sequential() 25 | records = [r for r in test_driller.graph.run( 26 | "MATCH(n) RETURN n")] 27 | assert len(records) == 22 28 | 29 | test_driller.clean() 30 | 31 | def test_drill_batch_cache(self): 32 | folder = os.path.dirname(os.path.abspath(__file__)) 33 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml')) 34 | test_driller.drill_batch_cache_all() 35 | records = [r for r in test_driller.graph.run( 36 | "MATCH(n) RETURN n")] 37 | assert len(records) == 22 38 | 39 | test_driller.clean() 40 | -------------------------------------------------------------------------------- /tests/test_commit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pytest 17 | import yaml 18 | 19 | from py2neo import NodeMatcher, RelationshipMatcher 20 | from graphrepo.drillers.driller import Driller 21 | from graphrepo.drillers.cache_driller import CacheDriller 22 | 23 | 24 | class TestCommit: 25 | """Most data is indexed when indexing a commit 26 | so this class tests indexing for multiple models""" 27 | 28 | def test_nodes_index(self): 29 | folder = os.path.dirname(os.path.abspath(__file__)) 30 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 31 | test_driller.drill_batch() 32 | 33 | # test that all nodes were indexed 34 | node_matcher = NodeMatcher(test_driller.graph) 35 | all_commits = list(node_matcher.match("Commit")) 36 | assert len(all_commits) == 8 37 | 38 | all_devs = list(node_matcher.match("Developer")) 39 | assert len(all_devs) == 2 40 | 41 | all_files = list(node_matcher.match("File")) 42 | assert len(all_files) == 6 43 | 44 | all_methods = list(node_matcher.match("Method")) 45 | assert len(all_methods) == 5 46 | 47 | all_branches = list(node_matcher.match("Branch")) 48 | assert len(all_branches) == 1 49 | 50 | test_driller.clean() 51 | 52 | def test_rel_index(self): 53 | folder = os.path.dirname(os.path.abspath(__file__)) 54 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 55 | test_driller.drill_batch() 56 | 57 | # test that all relationships were indexed 58 | rel_matcher = RelationshipMatcher(test_driller.graph) 59 | 60 | all_branch = list(rel_matcher.match(None, "BranchCommit")) 61 | assert len(all_branch) == 8 62 | 63 | all_authorship = list(rel_matcher.match(None, "Author")) 64 | assert len(all_authorship) == 8 65 | 66 | all_parent = list(rel_matcher.match(None, "Parent")) 67 | assert len(all_parent) == 8 68 | 69 | all_updatedfile =
list(rel_matcher.match(None, "UpdateFile")) 70 | assert len(all_updatedfile) == 9 71 | 72 | all_hasmethod = list(rel_matcher.match(None, "Method")) 73 | assert len(all_hasmethod) == 5 74 | 75 | all_updatemethod = list(rel_matcher.match(None, "UpdateMethod")) 76 | assert len(all_updatemethod) == 9 77 | 78 | test_driller.clean() 79 | 80 | def test_rel_index_cache(self): 81 | folder = os.path.dirname(os.path.abspath(__file__)) 82 | test_driller = CacheDriller(os.path.join(folder, 'cnfg_simple.yml')) 83 | test_driller.drill_batch_cache_sequential() 84 | 85 | # test that all relationships were indexed 86 | rel_matcher = RelationshipMatcher(test_driller.graph) 87 | 88 | all_branch = list(rel_matcher.match(None, "BranchCommit")) 89 | assert len(all_branch) == 8 90 | 91 | all_authorship = list(rel_matcher.match(None, "Author")) 92 | assert len(all_authorship) == 8 93 | 94 | all_parent = list(rel_matcher.match(None, "Parent")) 95 | assert len(all_parent) == 8 96 | 97 | all_updatedfile = list(rel_matcher.match(None, "UpdateFile")) 98 | assert len(all_updatedfile) == 9 99 | 100 | all_hasmethod = list(rel_matcher.match(None, "Method")) 101 | assert len(all_hasmethod) == 5 102 | 103 | all_updatemethod = list(rel_matcher.match(None, "UpdateMethod")) 104 | assert len(all_updatemethod) == 9 105 | 106 | test_driller.clean() 107 | 108 | def test_custom_attributes_rel(self): 109 | folder = os.path.dirname(os.path.abspath(__file__)) 110 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 111 | test_driller.drill_batch() 112 | 113 | node_matcher = NodeMatcher(test_driller.graph) 114 | rel_matcher = RelationshipMatcher(test_driller.graph) 115 | 116 | commit = node_matcher.match( 117 | "Commit", hash="aa6fa504ccb0fa919acc3cb31e510dc2048314eb0656f34babada15c").first() 118 | assert commit['is_merge'] == 0 119 | 120 | update_file_rel = rel_matcher.match([commit], "UpdateFile").first() 121 | assert update_file_rel['complexity'] == 2 122 | assert update_file_rel['nloc'] == 8 123 | assert update_file_rel['old_path'] == 'gr_test/default_class.py' 124 | assert update_file_rel['path'] == 'gr_test/default_class.py' 125 | assert update_file_rel['token_count'] == 42 126 | assert update_file_rel['type'] == 'MODIFY' 127 | assert update_file_rel['removed'] == 6 128 | assert update_file_rel['added'] == 0 129 | 130 | update_method_rel = rel_matcher.match( 131 | [commit], 'UpdateMethod').first() 132 | # assert update_method_rel['type'] == 'DELETE' 133 | assert update_method_rel['nloc'] == 5 134 | assert update_method_rel['complexity'] == 2 135 | assert update_method_rel['token_count'] == 21 136 | assert update_method_rel['length'] == 5 137 | assert update_method_rel['fan_in'] == 0 138 | assert update_method_rel['fan_out'] == 0 139 | assert update_method_rel['start_line'] == 11 140 | assert update_method_rel['end_line'] == 15 141 | 142 | test_driller.clean() 143 | -------------------------------------------------------------------------------- /tests/test_commit_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License.
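# NOTE (editor's illustration, not part of the repository): the query
# fragments produced by graphrepo.miners.utils.format_commit_id_date, on
# which the miners exercised by these tests rely. Values are placeholders.
from graphrepo.miners.utils import format_commit_id_date

com_filter, where = format_commit_id_date('graph_repo_test', 100, 200)
print(com_filter)  # {project_id: "graph_repo_test"}
print(where)       # WHERE c.timestamp >= 100 AND c.timestamp <= 200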
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from datetime import datetime 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.commit import CommitMiner 20 | 21 | 22 | class TestCommitMiner: 23 | def test_gets(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | st_date = datetime.strptime( 29 | '14 May, 2020 00:00', '%d %B, %Y %H:%M').timestamp() 30 | end_date = datetime.strptime( 31 | '15 May, 2020 02:00', '%d %B, %Y %H:%M').timestamp() 32 | 33 | n_matcher = NodeMatcher(test_driller.graph) 34 | r_matcher = RelationshipMatcher(test_driller.graph) 35 | 36 | com_miner = CommitMiner(test_driller.graph, n_matcher, r_matcher) 37 | 38 | all_com = com_miner.get_all() 39 | assert len(all_com) == 8 40 | 41 | all_com_dates = com_miner.get_between_dates(st_date, end_date) 42 | assert len(all_com_dates) == 8 43 | 44 | c_files = com_miner.get_commit_files( 45 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 46 | assert len(c_files) == 3 47 | 48 | c_file_updates = com_miner.get_commit_file_updates( 49 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 50 | assert len(c_file_updates) == 3 51 | 52 | c_methods = com_miner.get_commit_methods( 53 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 54 | assert len(c_methods) == 3 55 | 56 | c_method_updates = com_miner.get_commit_method_updates( 57 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 58 | assert len(c_method_updates) == 3 59 | 60 | test_driller.clean() 61 | -------------------------------------------------------------------------------- /tests/test_csv_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers import Driller 19 | from graphrepo.mappers import CSVMapper 20 | from graphrepo.miners import CommitMiner 21 | 22 | 23 | class TestCSVMapper: 24 | """Tests that data mined from the graph is 25 | correctly mapped to a tabular (CSV) format""" 26 | 27 | def test_csv_mapper(self): 28 | folder = os.path.dirname(os.path.abspath(__file__)) 29 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 30 | test_driller.drill_batch() 31 | 32 | n_matcher = NodeMatcher(test_driller.graph) 33 | r_matcher = RelationshipMatcher(test_driller.graph) 34 | 35 | com_miner = CommitMiner(test_driller.graph, n_matcher, r_matcher) 36 | mapper = CSVMapper() 37 | 38 | commits = com_miner.get_all() 39 | mapped_commits = mapper.map(commits) 40 | assert mapped_commits.shape == (8, 9)  # 8 commits x 9 attributes 41 | 42 | c_files = com_miner.get_commit_files( 43 | 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d') 44 | c_csv = mapper.map(c_files) 45 | assert c_csv.shape == (3, 5)  # 3 files x 5 attributes 46 | 47 | test_driller.clean() 48 | -------------------------------------------------------------------------------- /tests/test_db_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from graphrepo.drillers.default import DefaultDriller 4 | import graphrepo.drillers.db_init as db_init 5 | 6 | from py2neo.database import Schema 7 | 8 | 9 | class TestDBInit: 10 | def test_hash_constraints(self): 11 | folder = os.path.dirname(os.path.abspath(__file__)) 12 | test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml')) 13 | 14 | db_init.create_hash_constraints(test_driller.graph) 15 | 16 | schm = Schema(test_driller.graph) 17 | 18 | labels = ["Developer", "Branch", "Commit", "File", "Method"] 19 | 20 | for label in labels: 21 | constraints = schm.get_uniqueness_constraints(label) 22 | assert len(constraints) == 1 23 | 24 | # clean 25 | for label in labels: 26 | schm.drop_uniqueness_constraint(label, 'hash') 27 | 28 | def test_indices(self): 29 | folder = os.path.dirname(os.path.abspath(__file__)) 30 | test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml')) 31 | 32 | db_init.create_indices(test_driller.graph, hash_index=True) 33 | 34 | schm = Schema(test_driller.graph) 35 | 36 | index_authors = schm.get_indexes("Developer") 37 | assert len(index_authors) == 1 38 | 39 | index_branch = schm.get_indexes("Branch") 40 | assert len(index_branch) == 2 41 | 42 | index_commits = schm.get_indexes("Commit") 43 | assert len(index_commits) == 2 44 | 45 | index_files = schm.get_indexes("File") 46 | assert len(index_files) == 3 47 | 48 | index_methods = schm.get_indexes("Method") 49 | assert len(index_methods) == 3 50 | 51 | # clean 52 | schm.drop_index("Developer", "hash") 53 | schm.drop_index("Branch", "hash") 54 | schm.drop_index("Branch", "project_id") 55 | schm.drop_index("Commit", "hash") 56 | schm.drop_index("Commit", "project_id") 57 | schm.drop_index("File", "hash") 58 | schm.drop_index("File", "project_id") 59 | schm.drop_index("Method", "hash") 60 | schm.drop_index("Method", "project_id") 61 | -------------------------------------------------------------------------------- /tests/test_dev_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from datetime import datetime 16 | import os 17 | 18 | from py2neo import NodeMatcher, RelationshipMatcher 19 | from graphrepo.drillers.driller import Driller 20 | from graphrepo.miners.developer import DeveloperMiner 21 | 22 | 23 | class TestDevMiner: 24 | def test_gets(self): 25 | folder = os.path.dirname(os.path.abspath(__file__)) 26 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 27 | test_driller.drill_batch() 28 | 29 | st_date = datetime.strptime( 30 | "14 May, 2020 00:00", '%d %B, %Y %H:%M').timestamp() 31 | end_date = datetime.strptime( 32 | "15 May, 2020 02:00", '%d %B, %Y %H:%M').timestamp() 33 | 34 | n_matcher = NodeMatcher(test_driller.graph) 35 | r_matcher = RelationshipMatcher(test_driller.graph) 36 | 37 | dev_miner = DeveloperMiner(test_driller.graph, n_matcher, r_matcher) 38 | 39 | all_devs = dev_miner.get_all() 40 | assert len(all_devs) == 2 41 | 42 | all_commits = dev_miner.get_commits( 43 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb") 44 | assert len(all_commits) == 7 45 | 46 | all_com_id = dev_miner.get_commits( 47 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 48 | project_id=test_driller.config.ct.project_id 49 | ) 50 | assert len(all_com_id) == 7 51 | 52 | all_com_id_dates = dev_miner.get_commits( 53 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 54 | project_id=test_driller.config.ct.project_id, 55 | start_date=st_date, 56 | end_date=end_date 57 | ) 58 | assert len(all_com_id_dates) == 7 59 | 60 | all_files = dev_miner.get_files( 61 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 62 | ) 63 | assert len(all_files) == 6 64 | 65 | all_files_id_dates = dev_miner.get_files( 66 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 67 | project_id=test_driller.config.ct.project_id, 68 | start_date=st_date, 69 | end_date=end_date 70 | ) 71 | assert len(all_files_id_dates) == 6 72 | 73 | files_updates = dev_miner.get_files_updates( 74 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 75 | ) 76 | assert len(files_updates) == 9 77 | 78 | files_updates_id_dates = dev_miner.get_files_updates( 79 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 80 | project_id=test_driller.config.ct.project_id, 81 | start_date=st_date, 82 | end_date=end_date 83 | ) 84 | assert len(files_updates_id_dates) == 9 85 | 86 | all_methods = dev_miner.get_methods( 87 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 88 | ) 89 | assert len(all_methods) == 5 90 | 91 | all_methods_id_dates = dev_miner.get_methods( 92 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 93 | project_id=test_driller.config.ct.project_id, 94 | start_date=st_date, 95 | end_date=end_date 96 | ) 97 | assert len(all_methods_id_dates) == 5 98 | 99 | method_updates = dev_miner.get_method_updates( 100 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb" 101 | ) 102 | assert len(method_updates) == 9 103 | 104 | method_updates_id_dates = dev_miner.get_method_updates( 
105 | dev_hash="bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb", 106 | project_id=test_driller.config.ct.project_id, 107 | start_date=st_date, 108 | end_date=end_date 109 | ) 110 | assert len(method_updates_id_dates) == 9 111 | 112 | test_driller.clean() 113 | -------------------------------------------------------------------------------- /tests/test_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.driller import Driller 18 | 19 | 20 | class TestDriller: 21 | def test_configure(self): 22 | folder = os.path.dirname(os.path.abspath(__file__)) 23 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 24 | 25 | assert test_driller.config.ct.db_url == 'localhost' 26 | assert test_driller.config.ct.repo == 'tests/gr-test' 27 | 28 | assert test_driller.graph is not None 29 | 30 | def test_indexing(self): 31 | folder = os.path.dirname(os.path.abspath(__file__)) 32 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 33 | test_driller.drill_batch() 34 | records = [r for r in test_driller.graph.run( 35 | "MATCH(n) RETURN n")] 36 | assert len(records) == 22 37 | 38 | test_driller.clean() 39 | 40 | def test_index_save(self): 41 | folder = os.path.dirname(os.path.abspath(__file__)) 42 | test_driller = Driller(os.path.join(folder, 'cnfg_init.yml')) 43 | test_driller.drill_batch(save_path='data/graphrepo.json') 44 | records = [r for r in test_driller.graph.run( 45 | "MATCH(n) RETURN n")] 46 | assert len(records) == 22 47 | 48 | test_driller.clean() 49 | 50 | test_driller.index_from_file(file_path='data/graphrepo.json') 51 | records = [r for r in test_driller.graph.run( 52 | "MATCH(n) RETURN n")] 53 | assert len(records) == 22 54 | 55 | os.remove('data/graphrepo.json') 56 | test_driller.clean() 57 | -------------------------------------------------------------------------------- /tests/test_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/test_file.py -------------------------------------------------------------------------------- /tests/test_file_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.file import FileMiner 20 | 21 | 22 | class TestFileMiner: 23 | def test_get_all(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | n_matcher = NodeMatcher(test_driller.graph) 29 | r_matcher = RelationshipMatcher(test_driller.graph) 30 | 31 | f_miner = FileMiner(test_driller.graph, n_matcher, r_matcher) 32 | 33 | all_files = f_miner.get_all() 34 | assert len(all_files) == 6 35 | 36 | # get readme file 37 | readme = f_miner.query(name='README.MD') 38 | assert readme['name'] == 'README.MD' 39 | 40 | # get file history 41 | f_hash = 'f85f4af5b20ddd617f93da13c7789a65fb972e68a8d634d5f253abab' 42 | update_history = f_miner.get_change_history(f_hash) 43 | assert len(update_history) == 3 44 | 45 | # test file get methods 46 | current_m = f_miner.get_current_methods(f_hash) 47 | assert len(current_m) == 2 48 | 49 | test_driller.clean() 50 | -------------------------------------------------------------------------------- /tests/test_method_miner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from py2neo import NodeMatcher, RelationshipMatcher 18 | from graphrepo.drillers.driller import Driller 19 | from graphrepo.miners.method import MethodMiner 20 | 21 | 22 | class TestMethodMiner: 23 | def test_get_all(self): 24 | folder = os.path.dirname(os.path.abspath(__file__)) 25 | test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml')) 26 | test_driller.drill_batch() 27 | 28 | n_matcher = NodeMatcher(test_driller.graph) 29 | r_matcher = RelationshipMatcher(test_driller.graph) 30 | 31 | m_miner = MethodMiner(test_driller.graph, n_matcher, r_matcher) 32 | 33 | all_methods = m_miner.get_all() 34 | assert len(all_methods) == 5 35 | m_hash = '45ce8dcd8b0cd8ed42e592ce828ab6418e7c79713b8dc99805bcb7ea' 36 | met = m_miner.query(hash=m_hash) 37 | assert met['name'] == 'get_name' 38 | 39 | history = m_miner.get_change_history(m_hash) 40 | assert len(history) == 2 41 | 42 | test_driller.clean() 43 | -------------------------------------------------------------------------------- /tests/test_queue_driller.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | from graphrepo.drillers.queue_driller import QueueDriller 18 | 19 | 20 | # class TestQueueDriller: 21 | # def test_indexing(self): 22 | # folder = os.path.dirname(os.path.abspath(__file__)) 23 | # test_driller = QueueDriller(os.path.join(folder, 'cnfg_init.yml')) 24 | # test_driller.drill_batch() 25 | # records = [r for r in test_driller.graph.run( 26 | # "MATCH(n) RETURN n")] 27 | # assert len(records) == 22 28 | 29 | # test_driller.clean() 30 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 NullConvergence 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from graphrepo.utils import parse_config 17 | 18 | 19 | class TestUtils: 20 | def test_parse_config(self): 21 | folder = os.path.dirname(os.path.abspath(__file__)) 22 | neo, project = parse_config(os.path.join(folder, 'cnfg_init.yml')) 23 | assert neo['db_url'] == 'localhost' 24 | assert neo['db_user'] == 'neo4j' 25 | assert project['repo'] == 'tests/gr-test' 26 | --------------------------------------------------------------------------------
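The tests above double as usage documentation for GraphRepo's three main components: a driller indexes a Git repository into Neo4j from a YAML config, the miners query the resulting graph through py2neo matchers, and a mapper turns query results into tabular data. The sketch below strings those same calls together outside of a test harness; it is a minimal illustration, not a file from the repository, and it assumes a Neo4j instance is running and reachable with the credentials in cnfg_simple.yml.

# Minimal end-to-end sketch (assumes a running Neo4j instance and a valid
# config; every class and method used here appears in the tests above).
from py2neo import NodeMatcher, RelationshipMatcher

from graphrepo.drillers.driller import Driller
from graphrepo.mappers import CSVMapper
from graphrepo.miners.commit import CommitMiner

# Parse the YAML config and connect to the database.
driller = Driller('tests/cnfg_simple.yml')

# Index the repository named in the config into the graph.
driller.drill_batch()

# Miners take the py2neo graph plus node and relationship matchers.
n_matcher = NodeMatcher(driller.graph)
r_matcher = RelationshipMatcher(driller.graph)
com_miner = CommitMiner(driller.graph, n_matcher, r_matcher)

# Query the graph and map the results to a tabular (DataFrame-like) form.
commits = com_miner.get_all()
table = CSVMapper().map(commits)
print(table.shape)  # (number of commits, number of attributes)

# Remove the indexed data when finished.
driller.clean()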