├── .bettercodehub.yml
├── .gitignore
├── .gitmodules
├── .pylintrc
├── LICENSE
├── README.md
├── docs
├── Makefile
├── make.bat
└── source
│ ├── GraphRepoArch.svg
│ ├── GraphRepoArch_old.svg
│ ├── GraphRepoDS.svg
│ ├── GraphRepoSchema.svg
│ ├── _templates
│ └── breadcrumbs.html
│ ├── architecture.rst
│ ├── conf.py
│ ├── configuration.rst
│ ├── css
│ └── custom.css
│ ├── data_structure.rst
│ ├── driller.rst
│ ├── examples.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── mappers.rst
│ └── miners.rst
├── examples
├── __init__.py
├── all_method_complexity.py
├── benchmarks
│ ├── all_data.py
│ ├── all_methods_complexity.py
│ ├── dev_files.py
│ ├── dev_methods.py
│ └── file_nloc.py
├── configs
│ ├── graphrepo.yml
│ ├── grepo-test.yml
│ ├── hadoop.yml
│ ├── jax.yml
│ ├── kibana.yml
│ ├── pydriller.yml
│ └── tensorflow.yml
├── dev_data.py
├── file_complexity.py
├── index_all.py
└── mine_all.py
├── graphrepo
├── __init__.py
├── config.py
├── drillers
│ ├── __init__.py
│ ├── batch_utils.py
│ ├── cache_driller.py
│ ├── db_init.py
│ ├── default.py
│ ├── delete_all.py
│ ├── drill_cache.py
│ ├── driller.py
│ ├── queue_driller.py
│ ├── rabbit_driller.py
│ └── stomp_driller.py
├── logger.py
├── mappers
│ ├── __init__.py
│ ├── csv.py
│ └── default.py
├── miners
│ ├── __init__.py
│ ├── commit.py
│ ├── default.py
│ ├── developer.py
│ ├── file.py
│ ├── method.py
│ ├── mine_manager.py
│ └── utils.py
├── singleton.py
└── utils.py
├── requirements.txt
├── setup.py
└── tests
├── __init__.py
├── cnfg_init.yml
├── cnfg_simple.yml
├── test_cache_driller.py
├── test_commit.py
├── test_commit_miner.py
├── test_csv_mapper.py
├── test_db_init.py
├── test_dev_miner.py
├── test_driller.py
├── test_file.py
├── test_file_miner.py
├── test_method_miner.py
├── test_queue_driller.py
└── test_utils.py
/.bettercodehub.yml:
--------------------------------------------------------------------------------
1 | component_depth: 1
2 | languages:
3 | - python
4 | exclude:
5 | - /examples/.*
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # VSCode
2 | .vscode/
3 |
4 | # repo
5 | repos/
6 | data/
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # celery beat schedule file
93 | celerybeat-schedule
94 |
95 | # SageMath parsed files
96 | *.sage.py
97 |
98 | # Environments
99 | .env
100 | .venv
101 | env/
102 | venv/
103 | ENV/
104 | env.bak/
105 | venv.bak/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # mkdocs documentation
115 | /site
116 |
117 | # mypy
118 | .mypy_cache/
119 | .dmypy.json
120 | dmypy.json
121 |
122 | # Pyre type checker
123 | .pyre/
124 |
125 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tests/gr-test"]
2 | path = tests/gr-test
3 | url = https://github.com/NullConvergence/gr-test
4 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # Specify a configuration file.
4 | #rcfile=
5 |
6 | # Python code to execute, usually for sys.path manipulation such as
7 | # pygtk.require().
8 | #init-hook=
9 |
10 | # Add files or directories to the blacklist. They should be base names, not
11 | # paths.
12 | ignore=CVS
13 |
14 | # Add files or directories matching the regex patterns to the blacklist. The
15 | # regex matches against base names, not paths.
16 | ignore-patterns=
17 |
18 | # Pickle collected data for later comparisons.
19 | persistent=yes
20 |
21 | # List of plugins (as comma separated values of python modules names) to load,
22 | # usually to register additional checkers.
23 | load-plugins=
24 |
25 | # Use multiple processes to speed up Pylint.
26 | jobs=1
27 |
28 | # Allow loading of arbitrary C extensions. Extensions are imported into the
29 | # active Python interpreter and may run arbitrary code.
30 | unsafe-load-any-extension=no
31 |
32 | # A comma-separated list of package or module names from where C extensions may
33 | # be loaded. Extensions are loading into the active Python interpreter and may
34 | # run arbitrary code
35 | extension-pkg-whitelist=numpy
36 |
37 | # Allow optimization of some AST trees. This will activate a peephole AST
38 | # optimizer, which will apply various small optimizations. For instance, it can
39 | # be used to obtain the result of joining multiple strings with the addition
40 | # operator. Joining a lot of strings can lead to a maximum recursion error in
41 | # Pylint and this flag can prevent that. It has one side effect, the resulting
42 | # AST will be different than the one from reality. This option is deprecated
43 | # and it will be removed in Pylint 2.0.
44 | optimize-ast=no
45 |
46 |
47 | [MESSAGES CONTROL]
48 |
49 | # Only show warnings with the listed confidence levels. Leave empty to show
50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
51 | confidence=
52 |
53 | # Enable the message, report, category or checker with the given id(s). You can
54 | # either give multiple identifier separated by comma (,) or put this option
55 | # multiple time (only on the command line, not in the configuration file where
56 | # it should appear only once). See also the "--disable" option for examples.
57 | #enable=
58 |
59 | # Disable the message, report, category or checker with the given id(s). You
60 | # can either give multiple identifiers separated by comma (,) or put this
61 | # option multiple times (only on the command line, not in the configuration
62 | # file where it should appear only once).You can also use "--disable=all" to
63 | # disable everything first and then reenable specific checks. For example, if
64 | # you want to run only the similarities checker, you can use "--disable=all
65 | # --enable=similarities". If you want to run only the classes checker, but have
66 | # no Warning level messages displayed, use"--disable=all --enable=classes
67 | # --disable=W"
68 | disable=long-suffix,standarderror-builtin,indexing-exception,delslice-method,unichr-builtin,dict-view-method,parameter-unpacking,unicode-builtin,cmp-builtin,intern-builtin,round-builtin,backtick,nonzero-method,xrange-builtin,coerce-method,raw_input-builtin,old-division,filter-builtin-not-iterating,old-octal-literal,input-builtin,map-builtin-not-iterating,buffer-builtin,basestring-builtin,zip-builtin-not-iterating,using-cmp-argument,unpacking-in-except,old-raise-syntax,coerce-builtin,dict-iter-method,hex-method,range-builtin-not-iterating,useless-suppression,cmp-method,print-statement,reduce-builtin,file-builtin,long-builtin,getslice-method,execfile-builtin,no-absolute-import,metaclass-assignment,oct-method,reload-builtin,import-star-module-level,suppressed-message,apply-builtin,raising-string,next-method-called,setslice-method,old-ne-operator,arguments-differ,wildcard-import,locally-disabled
69 |
70 |
71 | [REPORTS]
72 |
73 | # Set the output format. Available formats are text, parseable, colorized, msvs
74 | # (visual studio) and html. You can also give a reporter class, eg
75 | # mypackage.mymodule.MyReporterClass.
76 | output-format=text
77 |
78 | # Put messages in a separate file for each module / package specified on the
79 | # command line instead of printing them on stdout. Reports (if any) will be
80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated
81 | # and it will be removed in Pylint 2.0.
82 | files-output=no
83 |
84 | # Tells whether to display a full report or only the messages
85 | reports=yes
86 |
87 | # Python expression which should return a note less than 10 (10 is the highest
88 | # note). You have access to the variables errors warning, statement which
89 | # respectively contain the number of errors / warnings messages and the total
90 | # number of statements analyzed. This is used by the global evaluation report
91 | # (RP0004).
92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
93 |
94 | # Template used to display messages. This is a python new-style format string
95 | # used to format the message information. See doc for all details
96 | #msg-template=
97 |
98 |
99 | [BASIC]
100 |
101 | # Good variable names which should always be accepted, separated by a comma
102 | good-names=i,j,k,ex,Run,_
103 |
104 | # Bad variable names which should always be refused, separated by a comma
105 | bad-names=foo,bar,baz,toto,tutu,tata
106 |
107 | # Colon-delimited sets of names that determine each other's naming style when
108 | # the name regexes allow several styles.
109 | name-group=
110 |
111 | # Include a hint for the correct naming format with invalid-name
112 | include-naming-hint=no
113 |
114 | # List of decorators that produce properties, such as abc.abstractproperty. Add
115 | # to this list to register other decorators that produce valid properties.
116 | property-classes=abc.abstractproperty
117 |
118 | # Regular expression matching correct variable names
119 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
120 |
121 | # Naming hint for variable names
122 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
123 |
124 | # Regular expression matching correct class attribute names
125 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
126 |
127 | # Naming hint for class attribute names
128 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
129 |
130 | # Regular expression matching correct argument names
131 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
132 |
133 | # Naming hint for argument names
134 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
135 |
136 | # Regular expression matching correct module names
137 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
138 |
139 | # Naming hint for module names
140 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
141 |
142 | # Regular expression matching correct constant names
143 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
144 |
145 | # Naming hint for constant names
146 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
147 |
148 | # Regular expression matching correct inline iteration names
149 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
150 |
151 | # Naming hint for inline iteration names
152 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
153 |
154 | # Regular expression matching correct method names
155 | method-rgx=[a-z_][a-z0-9_]{2,30}$
156 |
157 | # Naming hint for method names
158 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
159 |
160 | # Regular expression matching correct function names
161 | function-rgx=[a-z_][a-z0-9_]{2,30}$
162 |
163 | # Naming hint for function names
164 | function-name-hint=[a-z_][a-z0-9_]{2,30}$
165 |
166 | # Regular expression matching correct attribute names
167 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
168 |
169 | # Naming hint for attribute names
170 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
171 |
172 | # Regular expression matching correct class names
173 | class-rgx=[A-Z_][a-zA-Z0-9]+$
174 |
175 | # Naming hint for class names
176 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
177 |
178 | # Regular expression which should only match function or class names that do
179 | # not require a docstring.
180 | no-docstring-rgx=^test_
181 |
182 | # Minimum line length for functions/classes that require docstrings, shorter
183 | # ones are exempt.
184 | docstring-min-length=-1
185 |
186 |
187 | [ELIF]
188 |
189 | # Maximum number of nested blocks for function / method body
190 | max-nested-blocks=5
191 |
192 |
193 | [FORMAT]
194 |
195 | # Maximum number of characters on a single line.
196 | max-line-length=80
197 |
198 | # Regexp for a line that is allowed to be longer than the limit.
199 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
200 |
201 | # Allow the body of an if to be on the same line as the test if there is no
202 | # else.
203 | single-line-if-stmt=y
204 |
205 | # List of optional constructs for which whitespace checking is disabled. `dict-
206 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
207 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
208 | # `empty-line` allows space-only lines.
209 | no-space-check=trailing-comma,dict-separator
210 |
211 | # Maximum number of lines in a module
212 | max-module-lines=1000
213 |
214 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
215 | # tab).
216 | indent-string=' '
217 |
218 | # Number of spaces of indent required inside a hanging or continued line.
219 | indent-after-paren=4
220 |
221 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
222 | expected-line-ending-format=
223 |
224 |
225 | [LOGGING]
226 |
227 | # Logging modules to check that the string format arguments are in logging
228 | # function parameter format
229 | logging-modules=logging
230 |
231 |
232 | [MISCELLANEOUS]
233 |
234 | # List of note tags to take in consideration, separated by a comma.
235 | notes=FIXME,XXX,TODO
236 |
237 |
238 | [SIMILARITIES]
239 |
240 | # Minimum lines number of a similarity.
241 | min-similarity-lines=10
242 |
243 | # Ignore comments when computing similarities.
244 | ignore-comments=yes
245 |
246 | # Ignore docstrings when computing similarities.
247 | ignore-docstrings=yes
248 |
249 | # Ignore imports when computing similarities.
250 | ignore-imports=no
251 |
252 |
253 | [SPELLING]
254 |
255 | # Spelling dictionary name. Available dictionaries: none. To make it working
256 | # install python-enchant package.
257 | spelling-dict=
258 |
259 | # List of comma separated words that should not be checked.
260 | spelling-ignore-words=
261 |
262 | # A path to a file that contains private dictionary; one word per line.
263 | spelling-private-dict-file=
264 |
265 | # Tells whether to store unknown words to indicated private dictionary in
266 | # --spelling-private-dict-file option instead of raising a message.
267 | spelling-store-unknown-words=no
268 |
269 |
270 | [TYPECHECK]
271 |
272 | # Tells whether missing members accessed in mixin class should be ignored. A
273 | # mixin class is detected if its name ends with "mixin" (case insensitive).
274 | ignore-mixin-members=yes
275 |
276 | # List of module names for which member attributes should not be checked
277 | # (useful for modules/projects where namespaces are manipulated during runtime
278 | # and thus existing member attributes cannot be deduced by static analysis. It
279 | # supports qualified module names, as well as Unix pattern matching.
280 | ignored-modules=
281 |
282 | # List of class names for which member attributes should not be checked (useful
283 | # for classes with dynamically set attributes). This supports the use of
284 | # qualified names.
285 | ignored-classes=optparse.Values,thread._local,_thread._local,matplotlib.cm,tensorflow.python,tensorflow,tensorflow.train.Example,RunOptions
286 |
287 | # List of members which are set dynamically and missed by pylint inference
288 | # system, and so shouldn't trigger E1101 when accessed. Python regular
289 | # expressions are accepted.
290 | generated-members=set_shape,np.float32
291 |
292 | # List of decorators that produce context managers, such as
293 | # contextlib.contextmanager. Add to this list to register other decorators that
294 | # produce valid context managers.
295 | contextmanager-decorators=contextlib.contextmanager
296 |
297 |
298 | [VARIABLES]
299 |
300 | # Tells whether we should check for unused import in __init__ files.
301 | init-import=no
302 |
303 | # A regular expression matching the name of dummy variables (i.e. expectedly
304 | # not used).
305 | dummy-variables-rgx=(_+[a-zA-Z0-9_]*?$)|dummy
306 |
307 | # List of additional names supposed to be defined in builtins. Remember that
308 | # you should avoid to define new builtins when possible.
309 | additional-builtins=
310 |
311 | # List of strings which can identify a callback function by name. A callback
312 | # name must start or end with one of those strings.
313 | callbacks=cb_,_cb
314 |
315 | # List of qualified module names which can have objects that can redefine
316 | # builtins.
317 | redefining-builtins-modules=six.moves,future.builtins
318 |
319 |
320 | [CLASSES]
321 |
322 | # List of method names used to declare (i.e. assign) instance attributes.
323 | defining-attr-methods=__init__,__new__,setUp
324 |
325 | # List of valid names for the first argument in a class method.
326 | valid-classmethod-first-arg=cls
327 |
328 | # List of valid names for the first argument in a metaclass class method.
329 | valid-metaclass-classmethod-first-arg=mcs
330 |
331 | # List of member names, which should be excluded from the protected access
332 | # warning.
333 | exclude-protected=_asdict,_fields,_replace,_source,_make
334 |
335 |
336 | [DESIGN]
337 |
338 | # Maximum number of arguments for function / method
339 | max-args=10
340 |
341 | # Argument names that match this expression will be ignored. Default to name
342 | # with leading underscore
343 | ignored-argument-names=_.*
344 |
345 | # Maximum number of locals for function / method body
346 | max-locals=30
347 |
348 | # Maximum number of return / yield for function / method body
349 | max-returns=6
350 |
351 | # Maximum number of branch for function / method body
352 | max-branches=12
353 |
354 | # Maximum number of statements in function / method body
355 | max-statements=100
356 |
357 | # Maximum number of parents for a class (see R0901).
358 | max-parents=7
359 |
360 | # Maximum number of attributes for a class (see R0902).
361 | max-attributes=10
362 |
363 | # Minimum number of public methods for a class (see R0903).
364 | min-public-methods=0
365 |
366 | # Maximum number of public methods for a class (see R0904).
367 | max-public-methods=20
368 |
369 | # Maximum number of boolean expressions in a if statement
370 | max-bool-expr=5
371 |
372 |
373 | [IMPORTS]
374 |
375 | # Deprecated modules which should not be used, separated by a comma
376 | deprecated-modules=optparse
377 |
378 | # Create a graph of every (i.e. internal and external) dependencies in the
379 | # given file (report RP0402 must not be disabled)
380 | import-graph=
381 |
382 | # Create a graph of external dependencies in the given file (report RP0402 must
383 | # not be disabled)
384 | ext-import-graph=
385 |
386 | # Create a graph of internal dependencies in the given file (report RP0402 must
387 | # not be disabled)
388 | int-import-graph=
389 |
390 | # Force import order to recognize a module as part of the standard
391 | # compatibility libraries.
392 | known-standard-library=
393 |
394 | # Force import order to recognize a module as part of a third party library.
395 | known-third-party=enchant
396 |
397 | # Analyse import fallback blocks. This can be used to support both Python 2 and
398 | # 3 compatible code, which means that the block might have code that exists
399 | # only in one or another interpreter, leading to false positives when analysed.
400 | analyse-fallback-blocks=no
401 |
402 |
403 | [EXCEPTIONS]
404 |
405 | # Exceptions that will emit a warning when being caught. Defaults to
406 | # "Exception"
407 | overgeneral-exceptions=Exception
408 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GraphRepo  [](https://bettercodehub.com/)
2 |
3 | GraphRepo is a tool for mining software repositories in real time. It indexes Git repositories in Neo4j and implements multiple queries to select and process the repository data.
4 |
5 | For a complete description, see the [online documentation](https://graphrepo.readthedocs.io/en/latest/).
6 |
7 |
8 |
9 |
10 |
11 |
12 | ### 1. Installation & First run
13 |
14 | #### 1.1 Prereq
15 | The only requirement is to have Python >=3.5 and Docker installed on your system.
16 |
17 | #### 1.2 Install using pip
18 |
19 | The production release can be installed using pip:
20 |
21 | ```
22 | $ pip install graphrepo
23 | ```
24 |
25 |
36 |
37 |
38 | #### 1.3 Run and configure Neo4j
39 |
40 | The following instructions assume the Docker daemon is running on your machine:
41 |
42 | ```
43 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11
44 | ```
45 |
46 | Open a browser window and go to [http://localhost:7474](http://localhost:7474). Here you can configure the neo4j password.
47 | The default one is *neo4jj*.
48 |
49 | ##### Optionally, configure Neo4j to allow larger heap size using the following attributes with the command above:
50 |
51 | ```
52 | --env NEO4J_dbms_memory_pagecache_size=4g
53 | --env NEO4J_dbms_memory_heap_max__size=4g
54 | ```
55 |
 56 | #### 1.4. Index and visualize a repo
57 |
58 | In order to index a repository, you must clone it on localhost, and point GraphRepo to it. For example:
59 | ```
60 | $ mkdir repos
61 | $ cd repos
62 | $ git clone https://github.com/ishepard/pydriller
63 | ```
64 |
65 | Now enter the [examples](/examples) folder from this repository, and edit the configuration file for PyDriller to reflect the database URL and desired batch size:
66 | ```
67 | $ cd ../examples/
68 | $ nano configs/pydriller.yml
69 | ```
70 |
71 | Afterwards, we can run the script from the examples folder which indexes the repository in Neo4j:
72 |
73 | ```
74 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
75 | ```
76 |
77 | Go to [http://localhost:7474](http://localhost:7474) and use the query from 3.1
78 |
79 |
80 | #### 1.5. Retrieve all data from Neo4j using GraphRepo
81 |
 82 | Assuming you succeeded in step 1.4, use the following command to retrieve all indexed data:
83 |
84 | ```
85 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
86 | ```
87 |
88 |
89 | ### 2. Examples
90 |
91 | For a comprehensive introduction and more examples, see the [documentation](https://graphrepo.readthedocs.io/en/latest/examples.html).
92 |
93 |
94 |
95 | ### 3. Useful Neo4j queries for the web interface
96 |
97 | #### 3.1 Match all nodes in a graph
98 | ```
99 | MATCH (n) RETURN n
100 | ```
101 |
102 |
103 | #### 3.2 Delete all nodes and relationships in a graph
104 |
105 | ```
106 | MATCH (n) DETACH DELETE n;
107 | ```
108 |
 109 | #### 3.3 Delete a limited number of commits and relationships
110 |
111 | ```
112 | MATCH (n:Commit)
113 | // Take the first 100 commits nodes and their rels
114 | WITH n LIMIT 100
115 | DETACH DELETE n
116 | RETURN count(*);
117 | ```
118 |
119 |
120 |
121 | This project is enabled by [Pydriller](https://github.com/ishepard/pydriller).
122 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/GraphRepoDS.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/docs/source/GraphRepoSchema.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/_templates/breadcrumbs.html:
--------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 |
3 | {% block breadcrumbs_aside %}
4 | {% endblock %}
--------------------------------------------------------------------------------
/docs/source/architecture.rst:
--------------------------------------------------------------------------------
1 | .. _architecture_toplevel:
2 |
3 | ==================
4 | Architecture
5 | ==================
6 |
7 | GraphRepo consists of 3 main components:
8 |
9 | * :ref:`DRILLERS` - components used to parse data from a git repository and insert records in Neo4j,
10 | * :ref:`MINERS` and MinerManager - components which hold default queries and interfaces for retrieving data from Neo4j, and
11 | * :ref:`MAPPERS` - components used to transform the data retrieved by Miners in specific format, filter or sort data.
12 |
13 | The advantage of using custom mappers is that the load on Neo4j can be decreased,
14 | using lighter queries to extract the data and more intensive data processing in the
15 | custom mappers. For example, one can write a mapper using PySpark on raw data extracted
16 | from Neo4j and use the Apache Spark engine for scalability.
17 |
18 | .. image:: /GraphRepoArch.svg
19 | :width: 400
20 | :align: center
21 |
22 |
23 | Specific information about each component can be found using the links above.
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
 17 | import os
 18 | import sys
 19 | sys.path.insert(0, os.path.abspath('../'))  # make the repo root importable so autodoc can find the graphrepo package
 20 |
 21 |
 22 | # -- Project information -----------------------------------------------------
 23 |
 24 | project = 'GraphRepo'
 25 | copyright = '2021, GraphRepo'
 26 | author = 'GraphRepo'
 27 |
 28 | # The full version, including alpha/beta/rc tags
 29 | version = ''  # short X.Y version; intentionally left empty here
 30 | release = '1.0.0'
 31 |
 32 |
 33 | # -- General configuration ---------------------------------------------------
 34 |
 35 | master_doc = 'index'  # name of the root document of the toctree
 36 |
 37 | # Add any Sphinx extension module names here, as strings. They can be
 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 39 | # ones.
 40 | extensions = ['sphinx.ext.autodoc',
 41 | 'sphinx.ext.doctest']
 42 |
 43 | # Add any paths that contain templates here, relative to this directory.
 44 | templates_path = ['_templates']
 45 |
 46 | # List of patterns, relative to source directory, that match files and
 47 | # directories to ignore when looking for source files.
 48 | # This pattern also affects html_static_path and html_extra_path.
 49 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 50 |
 51 |
 52 | # -- Options for HTML output -------------------------------------------------
 53 |
 54 | # The theme to use for HTML and HTML Help pages. See the documentation for
 55 | # a list of builtin themes.
 56 | #
 57 | html_theme = 'sphinx_rtd_theme'
 58 |
 59 | # Add any paths that contain custom static files (such as style sheets) here,
 60 | # relative to this directory. They are copied after the builtin static files,
 61 | # so a file named "default.css" will overwrite the builtin "default.css".
 62 | html_static_path = ['_static']
 63 |
 64 |
 65 | html_css_files = [  # extra stylesheet(s) applied on top of the theme
 66 | 'css/custom.css',
 67 | ]
--------------------------------------------------------------------------------
/docs/source/configuration.rst:
--------------------------------------------------------------------------------
1 | .. _CONFIGURATION:
2 |
3 | ==================
4 | Configuration
5 | ==================
6 |
7 | For any activity, GraphRepo uses a yaml (.yml) configuration with 2 objects:
8 |
9 | * a Neo4j instance configuration, and
10 | * a repository configuration,
11 |
12 | as follows::
13 |
14 | neo:
15 | db_url: localhost # the url for the Neo4j database
16 | port: 7687 # the Neo4j port
17 | db_user: neo4j # Neo4j authentication username
18 | db_pwd: neo4jj # Neo4j authentication password
19 | batch_size: 100 # the batch size for inserting the records in Neo4j - this setting depends on the Neo4j resources
20 |
21 | project:
22 | repo: "repos/graphrepo/" # the repository filepath
23 | start_date: "1 February, 2018" # the start date for indexing (leave empty if it corresponds with the initial start date of the project)
 24 | end_date: "30 March, 2018" # the end date for indexing (leave empty if it corresponds with the last commit)
25 | project_id: "graphrepo" # a unique project id for the database
26 | index_code: False # boolean, if True GraphRepo indexes for each file touched by a commit the source code before and after the commit. This parameter significantly increases the index time and the hardware resources needed for Neo4j. For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
27 | index_developer_email: True # boolean, if True, GraphRepo indexes the developer emails in the Developer node. Turn flag off for GDPR or any other privacy concerns
28 |
29 |
30 |
31 | Neo4j configuration
32 | ====================
33 |
 34 | GraphRepo connects to Neo4j using the Bolt protocol from `py2neo <https://py2neo.org/>`_.
35 | Currently the only attributes needed to connect to Neo4j are the url+port and the authentication credentials.
36 | All other configurations (e.g., setting the user permissions) are done on the database side.
37 |
38 |
39 | Repository configuration
40 | ========================
41 |
42 | In order to insert a repository in the database, it has to be cloned on the local machine (where GraphRepo will run).
43 | Afterwards, it can be linked with GraphRepo using the ``project.repo`` attribute in the config file.
44 |
45 | If one does not want to use all the repository data (e.g., if the repository is very large), it can configure
46 | the index dates using the ``project.start_date`` and ``project.end_date`` attributes.
47 |
48 | The ``project.project_id`` attribute is used to give each project a unique identifier.
49 | Currently, GraphRepo indexes all repositories in the same database, in order to allow information about teams of developers that work
50 | on distinct projects to be mined without merging databases.
51 |
52 |
53 | The ``project.index_code`` attribute decides if GraphRepo indexes, for each file touched by a commit, the source code before and after the commit.
54 | This parameter significantly increases the index time and the hardware resources needed for Neo4j.
55 | For a medium size project, with 4000 commits, with an average of 1 file edited/commit, the equivalent of 8000 files will be stored in text in Neo4j if this parameter is set to True.
56 |
57 |
58 | For examples of config files, see the projects repository, ``examples/configs/pydriller.yml``.
59 |
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/docs/source/css/custom.css:
--------------------------------------------------------------------------------
1 | /* Hide "On GitHub" section from versions menu */
2 | div.rst-versions>div.rst-other-versions>div.injected>dl:nth-child(4) {
3 | display: none;
4 | }
5 |
6 | .wy-breadcrumbs-aside {
7 |
8 | display: none;
9 | }
--------------------------------------------------------------------------------
/docs/source/data_structure.rst:
--------------------------------------------------------------------------------
1 | .. _DS:
2 |
3 | ==================
4 | Schema
5 | ==================
6 |
7 | The resulting Neo4j schema consists of 5 node types and 6 relationship types, as illustrated below:
8 |
9 | .. figure:: /GraphRepoSchema.svg
10 | :width: 45 %
11 | :align: center
12 |
13 | Nodes
14 | ===========
15 |
16 |
17 | Branch
18 | -----------
19 |
20 | Each branch identified by PyDriller is indexed as a node with the following attributes::
21 |
22 | {
23 | "hash": "string - unique identifier",
24 | "project_id": "string - project id from config (can be used to select all branches from a project)",
25 | "name": "string - branch name",
26 | }
27 |
28 | Commit
29 | -----------
30 |
31 | Each commit is indexed as a node with the following attributes::
32 |
33 | {
34 | "hash": "string - unique identifier in Neo4j",
35 | "commit_hash": "string - commit hash in git",
36 | "message": "string - commit message in git",
37 | "is_merge": "int - 1 if the commit is merge, 0 otherwise",
38 | "timestamp": "int - Unix epoch, time of the commit",
39 | "project_id": "string - project id from config (can be used to select all branches from a project)",
40 | "dmm_unit_complexity": "int, see Pydriller",
41 | "dmm_unit_interfacing": "int, see Pydriller",
42 | "dmm_unit_size": "int, see Pydriller"
43 | }
44 |
45 |
46 |
47 | Developer
48 | -----------
49 |
50 | Each developer is indexed as a node with the following attributes::
51 |
52 | {
53 | "hash": "string - unique identifier",
54 | "name": "string - developer name as in git",
55 | "email": "string - developer email as in git",
56 | }
57 |
 58 | Currently the name and email information is not anonymized.
59 |
60 | File
61 | -----------
62 |
63 |
64 | Each file is indexed as a node with the following attributes::
65 |
66 | {
67 | "hash": "string - unique identifier",
68 | "name": "string - file short name as in git",
69 | "project_id": "string - project id from config (can be used to select all branches from a project)",
70 | "type": "string - file extension, e.g., '.py'"
71 | }
72 |
73 |
74 |
75 | Method
76 | -----------
77 |
78 | Each method is indexed as a node with the following attributes::
79 |
80 | {
81 | "hash": "string - unique identifier",
82 | "name": "string - method name as in file",
83 | "file_name": "string - parent file name",
84 | "project_id": "string - project id from config (can be used to select all branches from a project)",
85 | "type": "string - file extension, e.g., '.py'"
86 | }
87 |
88 |
89 |
90 | Relationships
91 | ===============
92 |
93 | Author
94 | -----------
95 |
96 | An Author relationship exists between each commit and its author.
97 | The direction is from Commit to Author and the relationship attributes are::
98 |
99 | {
100 | "timestamp": "int - Unix epoch, time of the commit"
101 | }
102 |
103 |
104 | BranchCommit
105 | --------------
106 | A BranchCommit relationship exists between each branch and the branch commits.
107 | The direction is from Branch to Commit. This relationship does not have any special attributes.
108 |
109 |
110 | Method
111 | -----------
112 |
 113 | A Method relationship exists between each file and its methods.
114 | The direction is from File to Method. This relationship does not have any special attributes.
115 | In order to find out if the method is still part of the file or it was deleted, we can use the FileMiner.
116 |
117 |
118 | Parent
119 | -----------
 120 | A parent relationship exists between each commit and its parent/parents.
121 | This relationship does not have any special attributes.
122 |
123 |
124 | UpdateFile
125 | -----------
126 |
127 | An UpdateFile relationship exists between a commit that edited a file and the edited file.
128 | The direction is from Commit to File and the relationship attributes are::
129 |
130 | {
131 | "timestamp": "int - Unix epoch, time of the commit",
132 | "old_path": "string - old path, if the file was moved (see type attribute)",
133 | "path": "string - current file path",
134 | "diff": "string - commit diff",
135 | "source_code": "string - source code after the commit",
 136 | "source_code_before": "string - source code before the commit",
137 | "nloc": "int - file lines of code after the commit",
138 | "complexity": "int - file complexity after the commit",
139 | "token_count": "int - number of tokens after the commit",
140 | "added": "int - number of lines added in commit",
141 | "removed": "int - number of lines removed in commit",
142 | "type": "string - type of update. Possible values are: 'ADD', 'COPY', 'RENAME', 'DELETE', 'MODIFY', 'UNKNOWN' "
143 | }
144 |
145 |
146 | UpdateMethod
147 | -------------
148 |
149 | An UpdateMethod relationship exists between a commit that edited a method and the edited method.
150 | The direction is from Commit to Method and the relationship attributes are::
151 |
152 | {
153 | "timestamp": "int - Unix epoch, time of the commit",
154 | "long_name": "string - method long name, including parameters",
155 | "parameters": "string - method parameters",
156 | "complexity": "int - method complexity, after commit",
157 | "nloc": "int - method lines of code, after commit",
158 | "fan_in": "int - method fan in, after commit",
159 | "fan_out": "int - method fan out, after commit",
160 | "general_fan_out": "int -method general fan out, after commit",
 161 | "length": "int - method length, after commit",
162 | "token_count": "int -method nr of tokens, after commit",
163 | "start_line": "int -method start line, after commit",
164 | "end_line": "int -method end line, after commit",
165 | }
166 |
--------------------------------------------------------------------------------
/docs/source/driller.rst:
--------------------------------------------------------------------------------
1 | .. _DRILLERS:
2 |
3 | ==================
4 | Drillers
5 | ==================
6 |
7 | All Drillers parse a repository and insert it in Neo4j.
 8 | Under the hood all drillers use PyDriller to extract data from a repository.
9 |
10 | Drillers perform the following activities.
11 | Given a config file, they:
12 |
13 | * establish a connection to Neo4j (or raise an exception if the connection fails),
14 | * parse the data from PyDriller,
15 | * insert the data in Neo4j.
16 |
17 |
18 | Currently there are 3 drillers available:
19 |
20 | * Driller - default driller that stores the data parsed from the repository in RAM memory.
21 | * CacheDriller - stores the data parsed from the repository on disk (thus saving RAM memory at the cost of more disk writes and decreased performance).
22 | * QueueDriller - stores the data parsed from a repository to a queue. Currently it supports RabbitMQ and Artemis. Please take note that two drillers must be used in case of a queue: (i) one that parses the data from Git repos and (ii) one that indexes the data in Neo4j.
23 | The queue driller is the most scalable one since it allows to have multiple instances for indexing. Thus it solves some scalability issues (e.g., PyDriller is single threaded).
24 |
25 | In order to index the data, you will need a config file (see :ref:`CONFIGURATION`) and the
26 | following code::
27 |
 28 | from graphrepo.drillers.drillers import Driller
 29 |
 30 | # configure driller
 31 | driller = Driller(config_path='path-to-yaml-config-file.yml')
 32 |
 33 | # Initialize the database indexes
 34 | try:
 35 | driller.init_db()
 36 | except Exception as exc:
 37 | print("DB already initialized")
 38 |
 39 | # drill (extract data and store it in Neo4j)
 40 | driller.drill_batch()
 41 |
 42 | # merge duplicate nodes
 43 | driller.merge_all()
44 |
45 |
46 | For a complete example, see :ref:`EXAMPLES`.
47 |
48 |
--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
1 | .. _EXAMPLES:
2 |
3 | ==================
4 | Examples
5 | ==================
6 |
7 | In the project's repository there are many examples on how to
8 | use GraphRepo to index and mine data.
9 |
10 | Please note that in order to run the plotting examples you have to install ``pandas`` and ``plotly``, for example using pip::
11 |
 12 | $ pip install pandas plotly
13 |
14 | 1. Index data
15 | ==============
16 |
17 | In this example, we index all data from PyDriller in Neo4j.
18 | The example assumes you are running a Neo4j instance in Docker, as indicated in :ref:`CONFIGURATION`.
19 |
20 | In order to run the example, clone the projects using the following commands::
21 |
22 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo
23 | $ cd graphrepo
24 | $ mkdir repos
25 | $ cd repos
26 | $ git clone https://github.com/ishepard/pydriller
27 |
28 | In this step we cloned the GraphRepo project, which includes the example scripts to run
29 | and the PyDriller project, which we want to experiment with.
30 |
31 | In order to run the indexing example, make sure to configure the config file in ``examples/configs/pydriller.yml``
32 | and set the ``neo`` object to your database settings.
33 |
34 | Then run::
35 |
 36 | $ python -m examples.index_all --config=examples/configs/pydriller.yml
37 |
 38 | After indexing finishes, you can go to ``http://localhost:7474/browser/``
39 | and explore the project, with a query like: ``MATCH (n) RETURN n``.
40 |
41 |
42 | 2. Retrieve all data
43 | =====================
44 |
45 | This step assumes you already indexed the PyDriller repository
46 | in Neo4j, as indicated at Step 1.
47 | In order to retrieve all information for PyDriller, we can run
48 | the following example::
49 |
 50 | $ python -m examples.mine_all --config=examples/configs/pydriller.yml
51 |
52 | This script will print the number of nodes indexed in the database.
53 |
54 |
55 | 3. Plot file complexity over time
56 | ===================================
57 |
58 | This step assumes you already indexed the PyDriller repository
59 | in Neo4j, as indicated at Step 1.
60 | In this example we will use the miners to retrieve a file and
61 | plot its complexity evolution over time.
62 | The file used is ``examples/file_complexity.py``.
63 | The complexity is stored in the ``UpdateFile`` relationship (see Schema).
64 | The ``get_change_history`` from the ``File`` miner retrieves all the ``UpdateFile``
65 | relationships that point to the file.
66 |
67 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
68 | although any other libraries can be used.
69 |
70 | In order to display the plot, run::
71 |
72 | $ python -m examples.file_complexity --config=examples/configs/pydriller.yml
73 |
74 |
75 |
76 |
 77 | 4. Plot file methods complexity over time
78 | ==========================================
79 |
80 | This step assumes you already indexed the PyDriller repository
81 | in Neo4j, as indicated at Step 1.
82 | In this example we will use the miners to retrieve and plot the complexity
83 | evolution over time of all methods in a file.
84 | The file used is ``examples/all_method_complexity.py``.
 85 | The complexity is stored in the ``UpdateMethod`` relationship (see Data Structure).
86 | We first get all the methods for a file, then, for each method, we get the
87 | update information as in Step 2.
88 |
89 | For plotting, in the example we map the data to a pandas DataFrame and use Plotly,
90 | although any other libraries can be used.
91 |
92 | In order to display the plot, run::
93 |
94 | $ python -m examples.all_method_complexity --config=examples/configs/pydriller.yml
95 |
96 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. GraphRepo documentation master file, created by
2 | sphinx-quickstart on Wed Jun 3 13:16:41 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | GraphRepo documentation
7 | =====================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 |
12 | installation
13 | configuration
14 | architecture
15 | data_structure
16 | driller
17 | miners
18 | examples
19 |
20 |
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | .. _INSTALLATION:
2 |
3 | ========================
4 | Overview & Installation
5 | ========================
6 |
7 | GraphRepo is a tool that indexes Git repositories in Neo4j, and allows to query and aggregate the data.
 8 | Under the hood it uses `PyDriller <https://github.com/ishepard/pydriller>`_ to parse the data from a repository.
9 |
10 | Requirements
11 | ============
12 |
 13 | * Python 3.5 (or newer)
14 | * Neo4j 3
15 | * Docker (Optional) - we recommend to use Docker for Neo4j (as indicated below)
16 |
17 | Installation - using pip
18 | =========================
19 |
20 | Assuming python and pip are installed, use:
21 |
22 | .. sourcecode:: none
23 |
24 | $ pip install graphrepo
25 |
26 |
27 | Installation - clone source code (dev version)
28 | ===============================================
29 |
30 | The latest development version can be cloned from Github::
31 |
32 | $ git clone --recurse-submodules https://github.com/NullConvergence/GraphRepo
33 | $ cd graphrepo
34 |
35 |
36 | Install the requirements:
37 |
38 | .. sourcecode:: none
39 |
40 | $ pip install -r requirements.txt
41 |
42 | Run a docker instance with Neo4j::
43 |
44 | $ docker run -p 7474:7474 -p 7687:7687 -v $HOME/neo4j/data:/data -v $HOME/neo4j/plugins:/plugins -e NEO4JLABS_PLUGINS=\[\"apoc\"\] -e NEO4J_AUTH=neo4j/neo4jj neo4j:3.5.11
45 |
46 | Run the tests::
47 |
48 | $ pytest
49 |
50 |
51 | Or see the :ref:`EXAMPLES`.
--------------------------------------------------------------------------------
/docs/source/mappers.rst:
--------------------------------------------------------------------------------
1 | .. _MAPPERS:
2 |
3 | ==================
4 | Mappers
5 | ==================
6 |
7 |
--------------------------------------------------------------------------------
/docs/source/miners.rst:
--------------------------------------------------------------------------------
1 | .. _MINERS:
2 |
3 | ==================
4 | Miners
5 | ==================
6 |
7 | Miners are special classes which hold default Neo4j queries that can be used to extract data.
8 | At the moment, there are 4 standard miners, specific to the most important node entities in the graph:
9 |
10 | * ``CommitMiner`` - default queries for commits (including relationships to other nodes),
11 | * ``DeveloperMiner`` - default queries for developers (including relationships to other nodes),
12 | * ``FileMiner`` - default queries for files (including relationships to other nodes),
13 | * ``MethodMiner`` - default queries for methods (including relationships to other nodes),
14 |
15 | and a ``MineManager``, which initializes and configures all miners.
16 |
17 | We recommend to always use the ``MineManager`` for initialization, since there is no overhead over initializing only one miner.
 18 | Using a config file (see :ref:`CONFIGURATION`), the ``MineManager`` can be initialized as follows::
19 |
20 | from graphrepo.miners import MineManager
21 |
22 | # initialize mine manager
23 | miner = MineManager(config_path=args.config)
24 |
25 | # The specific miners can now be accessed as:
26 | miner.commit_miner.get_all()
27 |
28 | miner.dev_miner.get_all()
29 |
30 | miner.file_miner.get_all()
31 |
32 | miner.method_miner.get_all()
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/examples/__init__.py
--------------------------------------------------------------------------------
/examples/all_method_complexity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module plots the method complexity evolution over time, for a file"""
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import pandas as pd
21 | import plotly.express as px
22 |
23 | from graphrepo.miners import MineManager
24 |
25 | from datetime import datetime
26 |
27 |
 28 | def parse_args():
 29 | """Parse args"""
 30 | parser = argparse.ArgumentParser()
 31 | parser.add_argument(
 32 | '--config', default='examples/configs/pydriller.yml', type=str)
 33 | parser.add_argument('--plot', default=False, type=bool)  # NOTE(review): argparse type=bool converts any non-empty string to True (e.g. --plot=False -> True); action='store_true' would be safer
 34 | return parser.parse_args()
35 |
36 |
 37 | def main():
 38 | """Main"""
 39 | args = parse_args()
 40 |
 41 | file_query = {
 42 | 'hash': 'e2eb7bf414cebe68f46fa88e4abe9ae5813e91c4e1e97570f8e41cf4'}  # hash of a pre-indexed File node — only valid if this exact repo state was indexed; TODO confirm
 43 |
 44 | start = datetime.now()  # wall-clock timer for the whole mining run
 45 | mine_manager = MineManager(config_path=args.config)
 46 |
 47 | methods = mine_manager.file_miner.get_current_methods(file_query['hash'])  # methods currently present in the file
 48 |
 49 | m_changes = []
 50 | for m in methods:
 51 | changes = mine_manager.method_miner.get_change_history(m)  # change records for this method
 52 | mc = [{'complexity': x['complexity'],
 53 | 'date': datetime.fromtimestamp(x['timestamp']),  # Unix epoch -> local datetime
 54 | 'name': m['name']} for x in changes]
 55 | m_changes = m_changes + mc  # accumulate one row per (method, change)
 56 | print('All methods complexity took: {}'.format(datetime.now() - start))
 57 | print('Total methods: ', len(methods))
 58 |
 59 | if args.plot:
 60 | df = pd.DataFrame(m_changes)
 61 | df['date'] = pd.to_datetime(df.date)
 62 | df = df.sort_values(by='date')  # chronological order so the lines plot left-to-right
 63 | fig = px.line(df, x="date", y="complexity", color="name",
 64 | line_group="name", hover_name="name")
 65 | fig.show()
70 |
--------------------------------------------------------------------------------
/examples/benchmarks/all_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import os
17 | import yaml
18 | from graphrepo.miners import MineManager
19 | from datetime import datetime
20 |
21 |
def parse_args():
    """Return the parsed CLI arguments (``--config`` path)."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--config', type=str,
                            default='configs/pydriller.yml')
    return arg_parser.parse_args()
26 |
27 |
def main():
    """Fetch every node and relationship from the DB and report the totals."""
    args = parse_args()

    begun = datetime.now()
    miner = MineManager(config_path=args.config)

    # raw totals: no mapping to Python objects, no merging of renamed entities
    nodes, rels = miner.get_all_data(map=False, merge=False)
    print("The DB has a total of {} nodes and {} relationships".format(
        len(nodes), len(rels)))
    print("All data took: {}".format(datetime.now() - begun))


if __name__ == '__main__':
    main()
43 |
--------------------------------------------------------------------------------
/examples/benchmarks/all_methods_complexity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import os
21 | import pandas as pd
22 | import plotly.express as px
23 |
24 | from graphrepo.miners import MineManager
25 | from graphrepo.utils import parse_config
26 |
27 | from datetime import datetime
28 |
29 |
def parse_args():
    """Parse CLI arguments: ``--config`` (YAML path) and ``--plot`` flag."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='configs/pydriller.yml', type=str)
    # ``type=bool`` made any non-empty string truthy ("--plot False" -> True);
    # a store_true flag has the same default and unambiguous semantics
    parser.add_argument('--plot', default=False, action='store_true')
    return parser.parse_args()
35 |
36 |
def main():
    """Benchmark: complexity history for every method of a pre-indexed file."""
    args = parse_args()

    # known benchmark projects -> hash of a pre-indexed file
    file_hashes = {
        'jax': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae',
        'hadoop': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806',
        'kibana': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48',
        'tensorflow': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73',
    }
    file_query = next(({'hash': h} for name, h in file_hashes.items()
                       if name in args.config), None)
    if file_query is None:
        # previously an unknown config crashed later with UnboundLocalError
        raise ValueError('No benchmark file hash known for config: '
                         + args.config)

    start = datetime.now()
    mine_manager = MineManager(config_path=args.config)
    methods = mine_manager.file_miner.get_current_methods(file_query['hash'])

    m_changes = []
    for m in methods:
        changes = mine_manager.method_miner.get_change_history(m['hash'])
        mc = [{'complexity': x['complexity'],
               'date': datetime.fromtimestamp(x['timestamp']),
               'name': m['name']} for x in changes]
        m_changes = m_changes + mc

    print('All methods complexity took: {}'.format(datetime.now() - start))
    print('Total methods: ', len(methods))


if __name__ == '__main__':
    main()
73 |
--------------------------------------------------------------------------------
/examples/benchmarks/dev_files.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import os
21 | import pandas as pd
22 | import plotly.express as px
23 |
24 | from datetime import datetime
25 | from graphrepo.miners import MineManager
26 | from graphrepo.utils import parse_config
27 |
28 |
def parse_args():
    """Build the CLI parser and return the parsed ``--config`` argument."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--config', type=str, default='configs/pydriller.yml')
    return cli.parse_args()
33 |
34 |
def main():
    """Benchmark: count the file types touched by a known developer."""
    args = parse_args()

    # known benchmark projects -> hash of a pre-indexed developer
    dev_hashes = {
        'jax': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381',
        'hadoop': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685',
        'kibana': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750',
        'tensorflow': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e',
    }
    dev_query = next(({'hash': h} for name, h in dev_hashes.items()
                      if name in args.config), None)
    if dev_query is None:
        # previously an unknown config crashed later with UnboundLocalError
        raise ValueError('No benchmark developer known for config: '
                         + args.config)

    start = datetime.now()
    mine_manager = MineManager(config_path=args.config)
    files = mine_manager.dev_miner.get_files(
        dev_query['hash'],
        mine_manager.config.ct.project_id
    )
    ft = [f['type'] for f in files]
    # grouping is intentionally computed even though unused: the benchmark
    # measures the full pipeline including this aggregation
    grouped = [{'file': x, 'count': len(
        [y for y in ft if x == y])} for x in set(ft)]

    print('Dev file types took {}'.format(datetime.now() - start))
    print('Nr files', len(ft))


if __name__ == '__main__':
    main()
68 |
--------------------------------------------------------------------------------
/examples/benchmarks/dev_methods.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 |
21 | from datetime import datetime
22 | from graphrepo.miners import MineManager
23 |
24 |
def parse_args():
    """Parse and return CLI arguments; only ``--config`` is supported."""
    p = argparse.ArgumentParser()
    p.add_argument('--config', type=str, default='configs/pydriller.yml')
    return p.parse_args()
29 |
30 |
def main():
    """Benchmark: average complexity of a known developer's method updates."""
    args = parse_args()

    # known benchmark projects -> hash of a pre-indexed developer
    dev_hashes = {
        'jax': '93476add93abfb4fcfdd5c61ed811099bbb2aab70874f554d38bf381',
        'hadoop': 'c92a1ec4e3eec053698d080439dc284a824b4de6fd5a4c8351631685',
        'kibana': 'bc95ed12093e3ca5ce0b30f4edda5b3692510d87b0b0bd08d2999750',
        'tensorflow': '1dfed5c1dfcb5c5eaf63522b7d993b721774bb153ef4be087384e72e',
    }
    dev_query = next(({'hash': h} for name, h in dev_hashes.items()
                      if name in args.config), None)
    if dev_query is None:
        # previously an unknown config crashed later with UnboundLocalError
        raise ValueError('No benchmark developer known for config: '
                         + args.config)

    start = datetime.now()
    mine_manager = MineManager(config_path=args.config)
    method_updates = mine_manager.dev_miner.get_method_updates(
        dev_query['hash'],
        mine_manager.config.ct.project_id
    )
    # -1 marks updates for which no complexity was recorded
    complexity = [c['complexity']
                  for c in method_updates if c['complexity'] != -1]
    # guard the mean against an empty list (previously ZeroDivisionError)
    _ = sum(complexity) / len(complexity) if complexity else 0

    # message fixed: it was copy-pasted from the dev-files benchmark
    print('Dev method updates took {}'.format(datetime.now() - start))
    print('Nr method updates', len(method_updates))


if __name__ == '__main__':
    main()
63 |
--------------------------------------------------------------------------------
/examples/benchmarks/file_nloc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import os
21 | import pandas as pd
22 | import plotly.express as px
23 |
24 | from datetime import datetime
25 | from graphrepo.miners import MineManager
26 | from graphrepo.utils import parse_config
27 |
28 |
def parse_args():
    """Parse CLI arguments: ``--config`` (YAML path) and ``--plot`` flag."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='configs/pydriller.yml', type=str)
    # ``type=bool`` made any non-empty string truthy ("--plot False" -> True);
    # a store_true flag keeps the False default with unambiguous semantics
    parser.add_argument('--plot', default=False, action='store_true')
    return parser.parse_args()
34 |
35 |
def main():
    """Benchmark: fetch the NLOC change history of a pre-indexed file."""
    args = parse_args()

    # known benchmark projects -> hash of a pre-indexed file
    file_hashes = {
        'jax': '84a34a3b24d33ba7736a19f7009591d6d4af6aa4368680664fd3a5ae',
        'hadoop': '0f3a2c18d68cf908803c5493a39f5039b7effa929ada77b43325e806',
        'kibana': 'bafb026d5ad56f9975c0feb6ea387126b8d953e5061c26ed11737b48',
        'tensorflow': 'd5204d385a92141e49aa8ce8b6330fafd825c02e4ee5ed86747c8e73',
    }
    file_query = next(({'hash': h} for name, h in file_hashes.items()
                       if name in args.config), None)
    if file_query is None:
        # previously an unknown config crashed later with UnboundLocalError
        raise ValueError('No benchmark file hash known for config: '
                         + args.config)

    start = datetime.now()

    mine_manager = MineManager(config_path=args.config)
    updated_file_rels = mine_manager.file_miner.get_change_history(
        file_hash=file_query['hash'])
    # computed on purpose even though unused: the benchmark measures
    # the full extraction pipeline
    nloc = [x['nloc'] for x in updated_file_rels]

    print('File nloc took {}'.format(datetime.now() - start))
    print('File changes', len(updated_file_rels))


if __name__ == '__main__':
    main()
69 |
--------------------------------------------------------------------------------
/examples/configs/graphrepo.yml:
--------------------------------------------------------------------------------
1 |
2 | neo:
3 | db_url: localhost
4 | port: 7687
5 | db_user: neo4j
6 | db_pwd: neo4jj
7 | batch_size: 50
8 |
9 | project:
10 | repo: repos/GraphRepo/
11 | start_date: #"1 February, 2018"
12 | end_date: #"30 March, 2018"
13 | project_id: 'graphrepo'
14 | index_code: False
15 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/configs/grepo-test.yml:
--------------------------------------------------------------------------------
1 |
2 | neo:
3 | db_url: localhost
4 | port: 7687
5 | db_user: neo4j
6 | db_pwd: neo4jj
7 | batch_size: 50
8 |
9 | project:
10 | repo: repos/gr-testbench/
11 | start_date: #"1 February, 2018"
12 | end_date: #"30 March, 2018"
13 | project_id: 'graphrepo-testbench'
14 | index_code: False
15 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/configs/hadoop.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 50
7 |
8 | project:
9 | repo: repos/hadoop/
10 | start_date: "1 January, 2017 00:00"
11 | end_date: "1 January, 2018 00:00"
12 | project_id: hadoop
13 | index_code: True
14 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/configs/jax.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 50
7 |
8 | project:
9 | repo: repos/jax/
10 | start_date: "1 January, 2019 00:00"
11 | end_date: "1 May, 2020 00:00"
12 | project_id: jax
13 | index_code: True
14 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/configs/kibana.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 50
7 |
8 | project:
9 | repo: repos/kibana/
10 | start_date: "1 June, 2018 00:00"
11 | end_date: "1 June, 2019 00:00"
12 | project_id: kibana
13 | index_code: True
14 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/configs/pydriller.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 50
7 |
8 | project:
9 | repo: repos/pydriller/
10 | start_date: #"1 February, 2018"
11 | end_date: #"30 March, 2018"
12 | project_id: 'pydriller'
13 | index_code: False
14 | index_developer_email: True
15 |
--------------------------------------------------------------------------------
/examples/configs/tensorflow.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 50
7 |
8 | project:
9 | repo: repos/tensorflow/
10 | start_date: "1 January, 2020 00:00"
11 | end_date: "1 March, 2020 00:00"
12 | project_id: 'tensorflow'
13 | index_code: True
14 | index_developer_email: True
--------------------------------------------------------------------------------
/examples/dev_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import os
21 | import pandas as pd
22 | import plotly.express as px
23 |
24 | from datetime import datetime
25 | from graphrepo.miners import MineManager
26 | from graphrepo.utils import parse_config
27 |
28 |
def parse_args():
    """Parse CLI arguments; only ``--config`` (YAML path) is supported."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--config', type=str,
                            default='examples/configs/pydriller.yml')
    return arg_parser.parse_args()
34 |
35 |
def main():
    """Print counts of files/methods and their updates for one developer."""
    args = parse_args()
    mine_manager = MineManager(config_path=args.config)

    # the same developer hash was repeated inline four times; hoisted here
    dev_hash = "6cf1f138e29c1bf82810ad0b73012302e0d20c2f76a24e3b225017b0"
    project = mine_manager.config.ct.project_id

    files = mine_manager.dev_miner.get_files(dev_hash, project)
    print(len(files), ' files')

    file_updates = mine_manager.dev_miner.get_files_updates(dev_hash, project)
    print(len(file_updates), ' file updates')

    methods = mine_manager.dev_miner.get_methods(dev_hash, project)
    print(len(methods), ' methods')

    method_updates = mine_manager.dev_miner.get_method_updates(
        dev_hash, project)
    print(len(method_updates), ' method updates')


if __name__ == '__main__':
    main()
66 |
--------------------------------------------------------------------------------
/examples/file_complexity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ###
17 | # This file assumes the project from the config file was already indexed
18 | ###
19 | import argparse
20 | import os
21 | import pandas as pd
22 | import plotly.express as px
23 |
24 | from datetime import datetime
25 | from graphrepo.miners import MineManager
26 | from graphrepo.utils import parse_config
27 |
28 |
def parse_args():
    """Return parsed CLI arguments (``--config`` path to the YAML file)."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--config', type=str, default='configs/pydriller.yml')
    return cli.parse_args()
33 |
34 |
def main():
    """Plot complexity and NLOC over time for the ``commit.py`` file."""
    args = parse_args()
    mine_manager = MineManager(config_path=args.config)

    file_miner = mine_manager.file_miner
    # fixed keyword typo: the original passed ``pproject_id``, which does not
    # match the project_id property the miner filters on -- verify against
    # the miner's query signature
    file_ = file_miner.query(project_id=mine_manager.config.ct.project_id,
                             name="commit.py")
    updated_file_rels = file_miner.get_change_history(file_['hash'])

    # sort update relationships and transform data for plotting
    updated_file_rels.sort(key=lambda x: x['timestamp'])

    complexity = [x['complexity'] for x in updated_file_rels]
    nloc = [x['nloc'] for x in updated_file_rels]
    dts = [datetime.fromtimestamp(x['timestamp']) for x in updated_file_rels]

    fig = px.line(pd.DataFrame({'date': dts, 'complexity': complexity}),
                  x='date', y='complexity',
                  title='Complexity over time for the commit.py file')
    fig.show()

    fig_2 = px.line(pd.DataFrame({'date': dts, 'nloc': nloc}),
                    x='date', y='nloc', title="NLOC over time for the commit.py file")
    fig_2.show()


if __name__ == '__main__':
    main()
63 |
--------------------------------------------------------------------------------
/examples/index_all.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module is an example of indexing all data from a repository in Neo4j"""
15 |
16 | import argparse
17 | from graphrepo.drillers import Driller
18 |
19 |
def parse_args():
    """Parse CLI arguments: ``--config`` selects the YAML config file."""
    p = argparse.ArgumentParser()
    p.add_argument('--config', type=str,
                   default='examples/configs/pydriller.yml')
    return p.parse_args()
26 |
27 |
def main():
    """Index the configured repository in Neo4j, then merge renamed entities."""
    args = parse_args()
    driller = Driller(config_path=args.config)
    # init_db should be called only once, when initializing a database for
    # the first time; re-running it raises because the indexes already exist
    try:
        driller.init_db()
    except Exception as exc:
        # report the real failure instead of the previous blanket
        # "DB already initialized" message, which hid genuine errors
        print("DB init failed ({}); assuming it is already initialized".format(
            exc))
    driller.drill_batch()
    driller.merge_all()


if __name__ == '__main__':
    main()
44 |
--------------------------------------------------------------------------------
/examples/mine_all.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import os
17 | import yaml
18 | from graphrepo.miners import MineManager
19 | from datetime import datetime
20 |
21 |
def parse_args():
    """Build the CLI parser and return the parsed arguments."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--config', type=str,
                            default='examples/configs/pydriller.yml')
    return arg_parser.parse_args()
27 |
28 |
def main():
    """Fetch all data plus per-entity counts and print a short report."""
    args = parse_args()

    begun = datetime.now()
    miner = MineManager(config_path=args.config)

    # all nodes and relationships in one round-trip
    nodes, rels = miner.get_all_data()
    print("The DB has a total of {} nodes and {} relationships".format(
        len(nodes), len(rels)))
    print("All data took: {}".format(datetime.now() - begun))

    # per-entity totals
    commits = miner.commit_miner.get_all()
    print("The DB has a total of {} commits".format(len(commits)))

    devs = miner.dev_miner.get_all()
    print("The DB has a total of {} developers".format(len(devs)))

    files = miner.file_miner.get_all()
    print("The DB has a total of {} files".format(len(files)))


if __name__ == '__main__':
    main()
56 |
--------------------------------------------------------------------------------
/graphrepo/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/graphrepo/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """This module stores all config constants. It is a singleton
16 | because it is used across several modules inside the app"""
17 |
18 | from graphrepo.singleton import Singleton
19 | from graphrepo.utils import Dotdict
20 |
21 |
class Config(metaclass=Singleton):
    """This class contains all config flags"""
    # dot-accessible configuration constants; replaced by a Dotdict
    # once configure() runs
    ct = {}

    def configure(self, **kwargs):
        """Stores configuration constants, parsed
        from yaml config file
        :param kwargs: keys and values from config
        """
        self.ct = Dotdict(kwargs)

    def check_config(self):
        """Checks if the config properties are set and
        raises ValueError if any value misses"""

        if not self.ct.db_url or not self.ct.port \
                or not self.ct.db_user or not self.ct.db_pwd:
            # typo fixed in the user-facing message ("configuartion")
            raise ValueError("Neo4j configuration is invalid.")
40 |
--------------------------------------------------------------------------------
/graphrepo/drillers/__init__.py:
--------------------------------------------------------------------------------
1 | from .driller import *
2 | from .cache_driller import *
3 | from .stomp_driller import *
4 | from .queue_driller import *
5 |
--------------------------------------------------------------------------------
/graphrepo/drillers/batch_utils.py:
--------------------------------------------------------------------------------
1 | """This module is the wild wild west of batch indexing :-)
2 | In contains all Neo4j queries for indexing the data in batches.
3 | More documentation will follow soon.
4 | """
5 | from datetime import datetime
6 |
7 |
def batch(iterable, n=1):
    """Yield successive chunks of at most *n* items from *iterable*."""
    total = len(iterable)
    for begin in range(0, total, n):
        # slicing clamps at the sequence end, so no explicit min() is needed
        yield iterable[begin:begin + n]
12 |
13 |
def index_commits(graph, commits, batch_size=100):
    """Upsert ``Commit`` nodes, ``batch_size`` records per query.

    :param graph: graph handle exposing ``run(query, **params)``
    :param commits: list of dicts; each must carry a ``hash`` key
    :param batch_size: number of commits sent per UNWIND statement
    """
    # NOTE(review): ``{commits}`` is the legacy py2neo/Neo4j<4 parameter
    # syntax -- confirm it matches the driver version in use
    query = """
    UNWIND {commits} AS c
    MERGE (nc :Commit { hash: c.hash})
    ON CREATE SET
        nc = c
    ON MATCH SET
        nc = c
    """
    for b in batch(commits, batch_size):
        graph.run(query, commits=b)
25 |
26 |
def index_parent_commits(graph, parents, batch_size=100):
    """Create ``Parent`` relationships between already-indexed commits.

    :param parents: dicts with ``parent_hash`` and ``child_hash`` keys
    :param batch_size: number of pairs sent per query
    """
    query = """
    UNWIND {ac} AS a
    MATCH (x:Commit),(y:Commit)
    WHERE x.hash = a.parent_hash AND y.hash = a.child_hash
    MERGE (x)-[r:Parent{}]->(y)
    """
    for b in batch(parents, batch_size):
        graph.run(query, ac=b)
36 |
37 |
def index_authors(graph, authors, batch_size=100):
    """Upsert ``Developer`` nodes, ``batch_size`` records per query.

    :param authors: list of dicts; each must carry a ``hash`` key
    """
    query = """
    UNWIND {authors} AS a
    MERGE (nd:Developer { hash: a.hash})
    ON CREATE SET nd = a
    ON MATCH SET nd = a
    """
    for b in batch(authors, batch_size):
        graph.run(query, authors=b)
47 |
48 |
def index_branches(graph, branches, batch_size=100):
    """Upsert ``Branch`` nodes, ``batch_size`` records per query.

    :param branches: list of dicts; each must carry a ``hash`` key
    """
    query = """
    UNWIND {branches} AS a
    MERGE (nb:Branch { hash: a.hash})
    ON CREATE SET nb = a
    ON MATCH SET nb = a
    """
    for b in batch(branches, batch_size):
        graph.run(query, branches=b)
58 |
59 |
def index_branch_commits(graph, bc, batch_size=100):
    """Create ``BranchCommit`` relationships between branches and commits.

    :param bc: dicts with ``branch_hash`` and ``commit_hash`` keys
    """
    query = """
    UNWIND {ac} AS a
    MATCH (x:Branch),(y:Commit)
    WHERE x.hash = a.branch_hash AND y.hash = a.commit_hash
    MERGE (x)-[r:BranchCommit{}]->(y)
    """
    for b in batch(bc, batch_size):
        graph.run(query, ac=b)
69 |
70 |
def index_files(graph, files, batch_size=100):
    """Upsert ``File`` nodes, ``batch_size`` records per query.

    :param files: list of dicts; each must carry a ``hash`` key
    """
    query = """
    UNWIND {files} AS f
    MERGE (nf:File { hash: f.hash})
    ON CREATE SET nf = f
    ON MATCH SET nf = f
    """
    for b in batch(files, batch_size):
        graph.run(query, files=b)
80 |
81 |
def index_methods(graph, methods, batch_size=100):
    """Upsert ``Method`` nodes, ``batch_size`` records per query.

    :param methods: list of dicts; each must carry a ``hash`` key
    """
    query = """
    UNWIND {methods} AS f
    MERGE (nm:Method { hash: f.hash})
    ON CREATE SET nm = f
    ON MATCH SET nm = f
    """

    for b in batch(methods, batch_size):
        graph.run(query, methods=b)
92 |
93 |
def index_author_commits(graph, ac, batch_size=100):
    """Create timestamped ``Author`` relationships (developer -> commit).

    :param ac: dicts with ``author_hash``, ``commit_hash`` and ``timestamp``
    """
    query = """
    UNWIND {ac} AS a
    MATCH (x:Developer),(y:Commit)
    WHERE x.hash = a.author_hash AND y.hash = a.commit_hash
    MERGE (x)-[r:Author{timestamp: a.timestamp}]->(y)
    """
    for b in batch(ac, batch_size):
        graph.run(query, ac=b)
103 |
104 |
def index_commit_files(graph, cf, batch_size=100):
    """Create ``UpdateFile`` relationships between commits and files.

    :param cf: dicts with ``commit_hash``, ``file_hash`` and an
        ``attributes`` map copied onto the relationship on creation
    :param batch_size: number of pairs sent per query
    """
    query = """
    UNWIND {cf} AS a
    MATCH (x:Commit),(y:File)
    WHERE x.hash = a.commit_hash AND y.hash = a.file_hash
    MERGE (x)-[r:UpdateFile{}]->(y)
    ON CREATE SET r=a['attributes']
    """
    # the batch index produced by ``enumerate`` was never used; dropped
    for b in batch(cf, batch_size):
        graph.run(query, cf=b)
115 |
116 |
def index_file_methods(graph, cf, batch_size=100):
    """Create ``Method`` relationships between files and their methods.

    :param cf: dicts with ``file_hash`` and ``method_hash`` keys
    """
    query = """
    UNWIND {cf} AS a
    MATCH (x:File),(y:Method)
    WHERE x.hash = a.file_hash AND y.hash = a.method_hash
    MERGE (x)-[r:Method{}]->(y)
    """
    for b in batch(cf, batch_size):
        graph.run(query, cf=b)
126 |
127 |
def index_commit_method(graph, cm, batch_size=100):
    """Create ``UpdateMethod`` relationships between commits and methods.

    :param cm: dicts with ``commit_hash``, ``method_hash`` and an
        ``attributes`` map copied onto the relationship on creation
    :param batch_size: number of pairs sent per query
    """
    query = """
    UNWIND {cf} AS a
    MATCH (x:Commit),(y:Method)
    WHERE x.hash = a.commit_hash AND y.hash = a.method_hash
    MERGE (x)-[r:UpdateMethod]->(y)
    ON CREATE SET r=a['attributes']
    """
    # the batch index produced by ``enumerate`` was never used; dropped
    for b in batch(cm, batch_size):
        graph.run(query, cf=b)
138 |
139 |
def create_index_authors(graph):
    """Create a Neo4j index on ``Developer.hash`` to speed up lookups."""
    graph.run("""
    CREATE INDEX ON :Developer(hash)
    """)
145 |
146 |
def create_index_commits(graph, hash=True):
    """Create indexes on ``Commit.project_id`` and optionally ``Commit.hash``.

    :param hash: when False, skip the hash index (parameter name kept for
        backward compatibility although it shadows the builtin)
    """
    if hash:
        graph.run("""
    CREATE INDEX ON :Commit(hash)
    """)

    graph.run("""
    CREATE INDEX ON :Commit(project_id)
    """)
159 |
160 |
def create_index_branches(graph, hash=True):
    """Create indexes on ``Branch.project_id`` and optionally ``Branch.hash``.

    :param hash: when False, skip the hash index
    """
    if hash:
        graph.run("""
    CREATE INDEX ON :Branch(hash)
    """)

    graph.run("""
    CREATE INDEX ON :Branch(project_id)
    """)
172 |
173 |
def create_index_files(graph, hash=True):
    """Create indexes on ``File.merge_hash``, ``File.project_id`` and
    optionally ``File.hash``.

    :param hash: when False, skip the hash index
    """
    if hash:
        graph.run("""
    CREATE INDEX ON :File(hash)
    """)

    graph.run("""
    CREATE INDEX ON :File(merge_hash)
    """)

    graph.run("""
    CREATE INDEX ON :File(project_id)
    """)
190 |
191 |
def create_index_methods(graph, hash=True):
    """Create indexes on ``Method.merge_hash``, ``Method.project_id`` and
    optionally ``Method.hash``.

    :param hash: when False, skip the hash index
    """
    if hash:
        graph.run("""
    CREATE INDEX ON :Method(hash)
    """)

    graph.run("""
    CREATE INDEX ON :Method(merge_hash)
    """)

    graph.run("""
    CREATE INDEX ON :Method(project_id)
    """)
208 |
209 |
def merge_renamed_files(graph, project_id):
    """Collapse File nodes that share a ``merge_hash`` (renamed files).

    Uses APOC to merge the duplicate nodes, then repoints the merge hash
    of the merged files' methods at the surviving file hash.

    :param graph: py2neo Graph connection
    :param project_id: project identifier used to scope the match
    """
    # NOTE: project_id is interpolated into the query (not a driver
    # parameter); it comes from trusted project configuration
    query = f"""
    MATCH (n1:File),(n2:File)
    WHERE n1.project_id = "{project_id}" and n2.project_id = "{project_id}" and n1.merge_hash = n2.merge_hash and id(n1) < id(n2)
    WITH [n1,n2] as ns
    order by id(ns[1]) desc
    CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node
    MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf
    with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows
    unwind allRows as row
    match (mu: Method {{hash: row.hash}})
    SET mu.merge_hash = row.new_hash"""
    graph.run(query)
223 |
def merge_new_files(graph, project_id):
    """Collapse a renamed File node into the new File it points at.

    Matches pairs where one file's ``merge_hash`` equals another file's
    ``hash``, merges them with APOC, then repoints the merge hash of the
    merged files' methods at the surviving file hash.

    :param graph: py2neo Graph connection
    :param project_id: project identifier used to scope the match
    """
    # NOTE: project_id is interpolated into the query (not a driver
    # parameter); it comes from trusted project configuration
    query = f"""
    MATCH (n1:File),(n2:File)
    WHERE n1.project_id = "{project_id}" and n2.project_id = "{project_id}" and n1.merge_hash = n2.hash and id(n1) < id(n2)
    WITH [n1,n2] as ns
    order by id(ns[1]) desc
    CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node
    MATCH (f:File {{hash: node.hash}}) -[]->(mf:Method) WITH DISTINCT f, mf
    with collect({{hash: mf.hash, new_hash: f.hash}}) as allRows
    unwind allRows as row
    match (mu: Method {{hash: row.hash}})
    SET mu.merge_hash = row.new_hash
    """
    graph.run(query)
238 |
239 |
def merge_methods(graph, project_id):
    """Collapse duplicate Method nodes of a project.

    Two methods are duplicates when they share file name, method name,
    project id and ``merge_hash``; the pair is merged with APOC.

    :param graph: py2neo Graph connection
    :param project_id: project identifier used to scope the match
    """
    # NOTE: project_id is interpolated into the query (not a driver
    # parameter); it comes from trusted project configuration
    query = f"""
    MATCH (n1:Method),(n2:Method)
    WHERE n1.project_id = "{project_id}" and n2.project_id = "{project_id}"
    and n1.file_name = n2.file_name and n1.name = n2.name and n1.project_id = n2.project_id and n1.merge_hash = n2.merge_hash and id(n1) < id(n2)
    WITH [n1,n2] as ns
    order by id(ns[1]) desc
    CALL apoc.refactor.mergeNodes(ns, {{properties: 'overwrite', mergeRels:true}}) YIELD node
    return node
    """
    graph.run(query)
251 |
252 |
def merge_files(graph, config):
    """Run the full file/method merge pipeline for a project.

    :param graph: py2neo Graph connection
    :param config: project config exposing ``project_id``
    """
    print('Merging moved files and methods')
    started = datetime.now()
    # renames first, then files merged into new files; methods are
    # re-merged after each file pass because the file merges rewrite
    # the methods' merge hashes
    for step in (merge_renamed_files, merge_methods,
                 merge_new_files, merge_methods):
        step(graph, config.project_id)
    print('Merged files and methods \t', datetime.now()-started)
261 |
def index_all(graph, developers, commits, parents, dev_commits, branches,
              branches_commits, files, commit_files, methods, file_methods,
              commit_methods, config):
    """Index all drilled repository data in Neo4j, one entity type at a
    time, printing progress and timing information for each step.

    :param graph: py2neo Graph connection
    :param developers: developer node dicts (may contain duplicates)
    :param commits: commit node dicts
    :param parents: commit->parent relationship records
    :param dev_commits: author->commit relationship records
    :param branches: branch node dicts (may contain duplicates)
    :param branches_commits: branch->commit relationship records
    :param files: file node dicts (may contain duplicates)
    :param commit_files: commit->file relationship records
    :param methods: method node dicts (may contain duplicates)
    :param file_methods: file->method relationship records
    :param commit_methods: commit->method relationship records
    :param config: project config exposing ``batch_size``
    """
    total = datetime.now()

    batch_size = config.batch_size

    # node lists contain one entry per occurrence in the history,
    # so deduplicate them by hash before indexing
    developers = list({v['hash']: v for v in developers}.values())
    print('Indexing ', len(developers), ' authors')
    start = datetime.now()
    index_authors(graph, developers, batch_size)
    print('Indexed authors in: \t', datetime.now()-start)

    print('Indexing ', len(commits), ' commits')
    start = datetime.now()
    index_commits(graph, commits, batch_size)
    print('Indexed commits in: \t', datetime.now()-start)

    branches = list({v['hash']: v for v in branches}.values())
    # relationship records have no hash; deduplicate by string form
    branches_commits = list({str(i): i for i in branches_commits}.values())
    print('Indexing ', len(branches), ' branches')
    start = datetime.now()
    index_branches(graph, branches, batch_size)
    index_branch_commits(graph, branches_commits, batch_size)
    print('Indexed branches in: \t', datetime.now()-start)

    files = list({v['hash']: v for v in files}.values())
    print('Indexing ', len(files), ' files')
    start = datetime.now()
    index_files(graph, files, batch_size)
    print('Indexed files in: \t', datetime.now()-start)

    methods = list({v['hash']: v for v in methods}.values())
    print('Indexing ', len(methods), ' methods')
    start = datetime.now()
    index_methods(graph, methods, batch_size)
    print('Indexed methods in: \t', datetime.now()-start)

    parents = list({str(i): i for i in parents}.values())
    print('Indexing ', len(parents), ' parent commits')
    start = datetime.now()
    index_parent_commits(graph, parents, batch_size)
    # FIX: message previously said 'Indexed commits'
    print('Indexed parent commits in: \t', datetime.now()-start)

    print('Indexing ', len(dev_commits), ' author_commits')
    start = datetime.now()
    index_author_commits(graph, dev_commits, batch_size)
    print('Indexed author_commits in: \t', datetime.now()-start)

    file_methods = list({str(i): i for i in file_methods}.values())
    # FIX: message previously had a typo ('Indexings')
    print('Indexing ', len(file_methods), ' file_methods')
    start = datetime.now()
    index_file_methods(graph, file_methods, batch_size)
    print('Indexed file_methods in: \t', datetime.now()-start)

    print('Indexing ', len(commit_methods), ' commit_methods')
    start = datetime.now()
    index_commit_method(graph, commit_methods, batch_size)
    print('Indexed commit_methods in: \t', datetime.now()-start)

    print('Indexing ', len(commit_files), ' commit_files')
    start = datetime.now()
    index_commit_files(graph, commit_files, batch_size)
    print('Indexed commit_files in: \t', datetime.now()-start)
    print('Indexing took: \t', datetime.now()-total)
328 |
329 |
def index_cache(graph, cache, config):
    """Index all cached repository data in Neo4j.

    :param graph: py2neo Graph connection
    :param cache: drill cache exposing a ``data`` mapping
    :param config: project config exposing ``batch_size``
    """
    batch_size = config.batch_size
    total = datetime.now()

    def _uniq_by_hash(records):
        # deduplicate node dicts by their 'hash' key (last wins)
        return list({v['hash']: v for v in records}.values())

    def _uniq(records):
        # relationship records have no hash; deduplicate by string form
        return list({str(i): i for i in records}.values())

    data = cache.data
    index_authors(graph, _uniq_by_hash(data['developers']), batch_size)
    index_commits(graph, data['commits'], batch_size)
    index_branches(graph, _uniq_by_hash(data['branches']), batch_size)
    index_branch_commits(graph, _uniq(data['branches_commits']), batch_size)
    index_files(graph, _uniq_by_hash(data['files']), batch_size)
    index_methods(graph, _uniq_by_hash(data['methods']), batch_size)
    index_parent_commits(graph, _uniq(data['parents']), batch_size)
    index_author_commits(graph, data['dev_commits'], batch_size)
    index_file_methods(graph, _uniq(data['file_methods']), batch_size)
    index_commit_method(graph, data['commit_methods'], batch_size)
    index_commit_files(graph, data['commit_files'], batch_size)
    print('Indexing took: \t', datetime.now()-total)
352 |
--------------------------------------------------------------------------------
/graphrepo/drillers/cache_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """ This module uses pydriller to search a repository
16 | and indexes it in neo4j
17 | """
18 | from datetime import datetime
19 | from pydriller import RepositoryMining
20 |
21 | import graphrepo.utils as utl
22 | import graphrepo.drillers.batch_utils as b_utl
23 | from graphrepo.drillers.drill_cache import DrillCache, DrillCacheSequential
24 | from graphrepo.drillers.default import DefaultDriller
25 | from graphrepo.logger import Logger
26 |
27 | LG = Logger()
28 |
29 |
class CacheDriller(DefaultDriller):
    """CacheDriller class - parses a git repo and uses the models
    to index everything in Neo4j by storing all data on disk.
    """

    def drill_batch_cache_sequential(self, index=True):
        """Extracts all information from a git repository
        and stores it in a disk cache, appending one record at a time
        (lower memory footprint than drill_batch_cache_all)
        :param index: optional, if True, the data is indexed in Neo4j
        :returns: cache with all data
        """
        start = datetime.now()
        print('Driller started at: \t', start)
        cache = DrillCacheSequential()
        # walk every commit in the configured repo/date range
        for commit in \
                RepositoryMining(self.config.ct.repo,
                                 since=self.config.ct.start_date,
                                 to=self.config.ct.end_date).traverse_commits():
            timestamp = commit.author_date.timestamp()
            # author node and commit node
            dev = utl.format_dev(commit, self.config.ct.index_developer_email)
            cache.append_cache('developers', dev)
            com = utl.format_commit(commit, self.config.ct.project_id)
            cache.append_cache('commits', com)
            # author -> commit relationship
            cache.append_cache(
                'dev_commits',
                utl.format_author_commit(dev, com, timestamp))
            # commit -> parent commit relationships
            for parent in commit.parents:
                cache.append_cache('parents', utl.format_parent_commit(
                    com['hash'], parent, self.config.ct.project_id))
            # branch nodes and branch -> commit relationships
            for branch in commit.branches:
                br_ = utl.format_branch(branch, self.config.ct.project_id)
                cache.append_cache('branches', br_)
                cache.append_cache('branches_commits', utl.format_branch_commit(
                    br_['hash'], com['hash']))
            # files touched by this commit
            for file in commit.modifications:
                fl_ = utl.format_file(file, self.config.ct.project_id)
                cache.append_cache('files', fl_)
                cache.append_cache('commit_files', utl.format_commit_file(
                    com['hash'], file, timestamp, self.config.ct.project_id))
                # methods changed inside the file
                for method in file.changed_methods:
                    met = utl.format_method(
                        method, file, self.config.ct.project_id)
                    cache.append_cache('methods', met)
                    cache.append_cache(
                        'file_methods',
                        utl.format_file_method(fl_['hash'],
                                               met['hash']))
                    cache.append_cache('commit_methods',
                                       utl.format_commit_method(
                                           com['hash'],
                                           met['hash'],
                                           method,
                                           timestamp))
        print('Driller finished in: \t', datetime.now() - start)
        if index:
            self.index_batch(cache)
        return cache

    def index_batch(self, cache):
        """Indexes cached data to Neo4j
        :param cache: diskcache Cache or Index
        """
        try:
            self.config.check_config()
            self._check_connection()
            b_utl.index_cache(
                self.graph, cache, config=self.config.ct)
        except Exception as exc:
            LG.log_and_raise(exc)
        else:
            return

    def drill_batch_cache_all(self, index=True):
        """Extracts the information from a repository in memory
        and caches it on disk after the extraction
        :param index: optional, if True, the data is indexed in Neo4j
        :returns: cache with all data
        """
        data = self.drill_batch(index=False)
        cache = DrillCache(data)
        if index:
            self.index_batch(cache)
        return cache
112 |
--------------------------------------------------------------------------------
/graphrepo/drillers/db_init.py:
--------------------------------------------------------------------------------
1 | """This module initializes the Neo4j indexes"""
2 | import graphrepo.drillers.batch_utils as utils
3 |
4 |
def create_hash_constraints(graph):
    """Create a uniqueness constraint on ``hash`` for every node label.

    :param graph: py2neo Graph connection
    """
    template = """CREATE CONSTRAINT ON (n: {}) ASSERT n.hash IS UNIQUE"""
    for label in ("Developer", "Branch", "Commit", "File", "Method"):
        graph.run(template.format(label))
12 |
13 |
def create_indices(graph, hash_index=True):
    """Initializes all indexes for the database

    :param graph: py2neo Graph connection
    :param hash_index: when True, the per-label hash indexes are also created
    """
    if hash_index:
        # the Developer label only has a hash index, so it is skipped
        # entirely when hash indexes are disabled
        utils.create_index_authors(graph)
    for create in (utils.create_index_branches, utils.create_index_commits,
                   utils.create_index_files, utils.create_index_methods):
        create(graph, hash_index)
22 |
--------------------------------------------------------------------------------
/graphrepo/drillers/default.py:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2021 GraphRepo
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Default Parent class for drillers
16 | """
17 | from abc import abstractmethod
18 | from datetime import datetime
19 | from py2neo import Graph
20 | from pydriller import RepositoryMining
21 |
22 | import graphrepo.utils as utl
23 | import graphrepo.drillers.batch_utils as b_utl
24 | import graphrepo.drillers.db_init as db_init
25 | from graphrepo.config import Config
26 | from graphrepo.logger import Logger
27 | LG = Logger()
28 |
29 |
class DefaultDriller():
    """DefaultDriller class - parses a git repo and uses the models
    to index everything in Neo4j.
    """

    def __init__(self, config_path):
        """Initializes the properties of this class
        :param config_path: path to yml config file
        :raises FileNotFoundError: if no config path is given
        """
        try:
            if not config_path:
                raise FileNotFoundError
            neo, project = utl.parse_config(config_path)
            self.config = Config()
            self.graph = None
            self.config.configure(**neo, **project)
            self._connect()
        except Exception as exc:
            LG.log_and_raise(exc)

    def _connect(self):
        """Instantiates the connection to Neo4j and stores
        the graph internally.
        Throws exception if the connection can not be realized
        """
        try:
            self.graph = Graph(host=self.config.ct.db_url,
                               user=self.config.ct.db_user,
                               password=self.config.ct.db_pwd,
                               port=self.config.ct.port)
        except Exception as exc:
            LG.log_and_raise(exc)

    def _check_connection(self):
        """Checks if there is a db connection and raises
        ReferenceError if not.
        """
        try:
            self._connect()
        # FIX: a bare 'except:' also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception and keep the cause chained
        except Exception as exc:
            raise ReferenceError("There is no valid "
                                 "database connection. Please "
                                 "configure and connect first.") from exc

    def init_db(self):
        """Runs initialization of a database; creates
        constraints and indexes"""
        # FIX: removed the no-op 'try ... except Exception as exc: raise exc'
        # wrapper, which only reset the traceback
        self._check_connection()
        db_init.create_hash_constraints(self.graph)
        db_init.create_indices(self.graph, hash_index=False)

    def clean(self):
        """Removes all data in a graph
        """
        try:
            self.config.check_config()
            self._check_connection()

            self.graph.run("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r")
        except Exception as exc:
            LG.log_and_raise(exc)

    def drill_batch(self, index=True, save_path=None):
        """Extracts data from a software repository, with the option
        of saving it on disk and indexing it in Neo4j
        :param index: optional, if True, the data is indexed in Neo4j
        :param save_path: optional, if given, the data is stored on disk
        :returns: dictionary with all data
        """
        start = datetime.now()
        print('Driller started at: \t', start)
        commits, parents, devs, dev_com, branches,\
            branches_com, files, com_files, \
            methods, files_methods, com_methods = \
            [], [], [], [], [], [], [], [], [], [], []
        for commit in \
                RepositoryMining(self.config.ct.repo,
                                 since=self.config.ct.start_date,
                                 to=self.config.ct.end_date).traverse_commits():
            # drill_commit mutates the lists in place
            self.drill_commit(commit, commits, parents, devs, dev_com, branches,
                              branches_com, files, com_files,
                              methods, files_methods, com_methods)

        data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches,
                                   branches_com, files, com_files,
                                   methods, files_methods, com_methods)

        print('Driller finished in: \t', datetime.now() - start)

        if save_path:
            utl.save_json(save_path, data_)
        if index:
            self.index_batch(**data_)
        return data_

    def drill_commit(self, commit, commits, parents, devs, dev_com, branches,
                     branches_com, files, com_files,
                     methods, files_methods, com_methods):
        """Formats one pydriller commit and appends the resulting records
        to the given lists (works with pass by reference)."""
        timestamp = commit.author_date.timestamp()
        # author and commit nodes, plus the author->commit relationship
        dev = utl.format_dev(commit, self.config.ct.index_developer_email)
        devs.append(dev)
        com = utl.format_commit(commit, self.config.ct.project_id)
        commits.append(com)
        dev_com.append(utl.format_author_commit(dev, com, timestamp))
        # commit -> parent commit relationships
        for parent in commit.parents:
            parents.append(utl.format_parent_commit(
                com['hash'], parent, self.config.ct.project_id))
        # branch nodes and branch -> commit relationships
        for branch in commit.branches:
            br_ = utl.format_branch(branch, self.config.ct.project_id)
            branches.append(br_)
            branches_com.append(
                utl.format_branch_commit(br_['hash'], com['hash']))
        # files touched by this commit, and their changed methods
        for file in commit.modifications:
            fl_ = utl.format_file(file, self.config.ct.project_id)
            files.append(fl_)
            com_files.append(utl.format_commit_file(
                com['hash'], file,
                timestamp, self.config.ct.project_id, self.config.ct.index_code))
            for method in file.changed_methods:
                met = utl.format_method(
                    method, file, self.config.ct.project_id)
                methods.append(met)
                files_methods.append(
                    utl.format_file_method(fl_['hash'], met['hash'])
                )
                com_methods.append(
                    utl.format_commit_method(com['hash'], met['hash'],
                                             method, timestamp))

    def data_dot_dict(self, commits, parents, devs, dev_com, branches,
                      branches_com, files, com_files,
                      methods, files_methods, com_methods):
        """Packs the drilled lists into a dot-accessible dict whose keys
        match the index_batch/index_all keyword arguments."""
        return utl.Dotdict({'commits': commits,
                            'parents': parents,
                            'developers': devs,
                            'dev_commits': dev_com,
                            'branches': branches,
                            'branches_commits': branches_com,
                            'files': files,
                            'commit_files': com_files,
                            'methods': methods,
                            'file_methods': files_methods,
                            'commit_methods': com_methods})

    @abstractmethod
    def index_batch(self):
        """Abstract index batch driller method
        """
        raise NotImplementedError

    def merge_all(self):
        """Merges file renaming and methods"""
        try:
            b_utl.merge_files(self.graph, self.config.ct)
        except Exception as exc:
            LG.log_and_raise(exc)
194 |
--------------------------------------------------------------------------------
/graphrepo/drillers/delete_all.py:
--------------------------------------------------------------------------------
1 | # def delete_all():
2 | # # get total #of nodes
3 | # res = session.run("MATCH(n) RETURN COUNT(*) AS n")
4 | # total_nodes = 0
5 | # for item in res:
6 | # total_nodes = item["n"]
7 | # print("\n Existing nodes in db:", total_nodes)
8 |
9 | # # get total #of relationships
10 | # res1 = session.run("MATCH (n)-[r]->() RETURN COUNT(r) as r")
11 | # total_rels = 0
12 | # for item in res1:
13 | # total_rels = item["r"]
14 | # print("\n Existing relationships in db:", total_rels)
15 |
16 | # # delete all nodes in batches (for faster deletion)
17 | # while total_nodes > 0:
18 | # res = session.run(
19 | # "MATCH(n) WITH n LIMIT 10000 DETACH DELETE n RETURN COUNT(n) AS count")
20 | # count = 0
21 | # for item in res:
22 | # count = item["count"] # updates deleeted node count here
23 | # total_nodes = total_nodes-count
24 | # print("\n #of nodes in db after deletion completed = ", total_nodes)
25 |
26 |
27 | # start = time.time()
28 | # delete_all()
29 | # print("\n Pre cleanup time (sec): ", time.time()-start)
30 |
31 | # for prot in fileList:
32 | # print("\n\n", prot)
33 | # if os.path.exists(prot+"_AllCCs_maxDist11.csv"):
34 | # print("\n Already Processed.")
35 | # continue
36 | # start = time.time()
37 | # delete_all()
38 | # pre_time = time.time()-start
39 | # print("\n Pre cleanup time (sec): ", pre_time)
40 |
41 | # # Database preparation
42 | # session.run("CREATE INDEX ON :MyNode(Name)")
43 |
44 | # # 1. Create graph
45 | # start = time.time()
46 | # session.run("USING PERIODIC COMMIT "
47 | # "LOAD CSV FROM 'file:///'+{prot}+'_conflict_resolved.txt' AS line "
48 | # "MERGE (n:MyNode {Name:line[0]}) "
49 | # "MERGE (m:MyNode {Name:line[1]}) "
50 | # "MERGE (n) -[:TO {dist:line[2]}] -> (m) ", prot=prot)
51 |
52 | # end = time.time()
53 | # step1_time = end - start
54 | # print("\n Step 1 time (in sec) = ", end-start)
55 |
56 | # # 2 find CCs
57 | # start = time.time()
58 | # result = session.run("CALL algo.unionFind.stream('MyNode', 'TO', {graph:'huge'}) "
59 | # "YIELD nodeId,setId "
60 | # "MATCH (n) "
61 | # "WHERE id(n)=nodeId "
62 | # "WITH setId,collect(nodeId) as nodes, collect(n.Name) as labels,count(*) as size_of_component "
63 | # "ORDER BY size_of_component DESC "
64 | # "RETURN setId as componentId,size_of_component,labels as connectedTSRkeys ")
65 | # end = time.time()
66 | # step2_time = end - start
67 | # print("\n Step 2 time (in sec) = ", end-start)
68 | # # 3. save result
69 | # start = time.time()
70 | # # newline='' <- to avoid blank line between two rows
71 | # with open(prot+"_AllCCs_maxDist11.csv", "w") as csvfile:
72 | # writer = csv.writer(csvfile, delimiter=',')
73 | # writer.writerow(
74 | # ['componentId', 'size_of_component', 'connectedTSRkeys'])
75 | # for record in result:
76 | # record = str(record)[:-1].replace(", ",
77 | # ",").replace("'", "").split()
78 | # print("\n", record[1], record[2], record[3])
79 | # writer.writerow([record[1].split("=")[1], record[2].split("=")[
80 | # 1], record[3].split("=")[1]])
81 | # end = time.time()
82 | # step3_time = end - start
83 | # print("\n Step 3 time (in sec) = ", end-start)
84 |
85 | # # 4. delete graph
86 | # start = time.time()
87 | # delete_all()
88 | # end = time.time()
89 | # post_time = end - start
90 | # print("\n Post cleanup time (in sec) = ", end-start)
91 |
92 | # print("\n Total time = ", pre_time+step1_time +
93 | # step2_time+step3_time+post_time)
94 |
95 | # driver.close()
96 |
--------------------------------------------------------------------------------
/graphrepo/drillers/drill_cache.py:
--------------------------------------------------------------------------------
1 | """This module saves the cache data on disk"""
2 | import collections
3 | from diskcache import Index
4 |
5 |
class DrillCache:
    """Disk cache that stores a full data dictionary at once."""

    def __init__(self, data):
        """Copies the data, preserving key order, into a diskcache Index
        :param data: dict with all drilled repository data
        """
        self.data = Index(collections.OrderedDict(data.items()))
13 |
14 |
class DrillCacheSequential:
    """Disk cache that is filled incrementally, one record at a time."""

    def __init__(self):
        """Creates one empty list per data category in a diskcache Index"""
        keys = ('commits', 'parents', 'developers', 'dev_commits',
                'branches', 'branches_commits', 'files', 'commit_files',
                'methods', 'file_methods', 'commit_methods')
        self.data = Index([(key, []) for key in keys])

    def append_cache(self, key, value):
        """Appends a record to the list stored on disk under key
        :param key: data key
        :param value: value to append
        """
        # diskcache does not persist in-place mutation, so the list is
        # read, extended, and written back
        records = self.data[key]
        records.append(value)
        self.data[key] = records
36 |
--------------------------------------------------------------------------------
/graphrepo/drillers/driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """ This module uses pydriller to search a repository
16 | and indexes it in neo4j
17 | """
18 | from diskcache import Cache
19 | from datetime import datetime
20 | from py2neo import Graph
21 | from pydriller import RepositoryMining
22 |
23 | import graphrepo.utils as utl
24 | import graphrepo.drillers.batch_utils as b_utl
25 | from graphrepo.config import Config
26 | from graphrepo.drillers.drill_cache import DrillCacheSequential
27 | from graphrepo.drillers.default import DefaultDriller
28 | from graphrepo.logger import Logger
29 |
30 | LG = Logger()
31 |
32 |
class Driller(DefaultDriller):
    """Drill class - parses a git repo and uses the models
    to index everything in Neo4j. The Neo4j connection is held
    internally in self.graph.
    """
    # FIX: the old docstring claimed this class is a singleton, which the
    # visible code does not enforce; removed the misleading claim.

    def index_batch(self, **kwargs):
        """Indexes data extracted by drill_batch or loaded from
        disk in Neo4j
        :param kwargs: data keys and values (see the drill_batch return)
        """
        # FIX: removed the dead 'else: return' clause (no-op)
        try:
            self.config.check_config()
            self._check_connection()
            b_utl.index_all(
                self.graph, config=self.config.ct, **kwargs)
        except Exception as exc:
            LG.log_and_raise(exc)

    def index_from_file(self, file_path):
        """Reads a file and indexes the data in Neo4j
        :param file_path: the path of the JSON file with data
        """
        try:
            data_ = utl.load_json(file_path)
            self.index_batch(**data_)
        except Exception as exc:
            LG.log_and_raise(exc)
--------------------------------------------------------------------------------
/graphrepo/drillers/queue_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Default Parent class for drillers
15 | """
16 | from abc import abstractmethod
17 | from datetime import datetime
18 | from py2neo import Graph
19 | from pydriller import RepositoryMining
20 |
21 | import graphrepo.utils as utl
22 | from graphrepo.config import Config
23 | from graphrepo.drillers.driller import Driller
24 | import graphrepo.drillers.batch_utils as b_utl
25 | from graphrepo.logger import Logger
26 |
27 | LG = Logger()
28 |
29 |
class QueueDriller(Driller):
    """QueueDriller class - parses a git repo and publishes
    the data in a queue every n commits
    """

    def __init__(self, neo, project, queue):
        """Initializes the properties of this class
        :param neo: dict with the Neo4j connection configuration
        :param project: dict with the project configuration
        :param queue: dict with the queue configuration; must contain a
            'commit_batch' entry (number of commits per published message)
        """
        # TODO: validate inputs
        try:
            self.project, self.queue = project, queue
            self.config = Config()
            self.graph = None
            self.config.configure(**neo, **self.project)
            # no self._connect(): this driller only publishes to a queue;
            # the Neo4j connection is made by the consumer
        except Exception as exc:
            LG.log_and_raise(exc)

    @abstractmethod
    def connect_queue(self):
        """Establishes a connection to queue"""
        raise NotImplementedError

    @abstractmethod
    def send_index_data(self, data):
        """Publishes one batch of data to the queue"""
        raise NotImplementedError

    def drill_batch(self, index=True, save_path=None):
        """Extracts data from a software repository and publishes it to
        the queue in batches of 'commit_batch' commits.
        :param index: unused here, kept for interface compatibility
        :param save_path: unused here, kept for interface compatibility
        """
        start = datetime.now()
        print('Driller started at: \t', start)
        commits, parents, devs, dev_com, branches,\
            branches_com, files, com_files, \
            methods, files_methods, com_methods = \
            [], [], [], [], [], [], [], [], [], [], []
        commit_index = 0
        for commit in \
                RepositoryMining(self.config.ct.repo,
                                 since=self.config.ct.start_date,
                                 to=self.config.ct.end_date).traverse_commits():

            self.drill_commit(commit, commits, parents, devs, dev_com, branches,
                              branches_com, files, com_files,
                              methods, files_methods, com_methods)

            if commit_index == self.queue['commit_batch'] - 1:
                data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches,
                                           branches_com, files, com_files,
                                           methods, files_methods, com_methods)

                self.send_index_data(
                    {'project_conf': self.project, 'data': data_})

                # reset the accumulators and the batch counter
                commits, parents, devs, dev_com, branches, branches_com, files, com_files, methods, files_methods, com_methods = [
                ], [], [], [], [], [], [], [], [], [], []
                commit_index = 0
            else:
                commit_index += 1

        # FIX: publish the final partial batch; previously any commits
        # left over after the last full batch were silently dropped
        if commits:
            data_ = self.data_dot_dict(commits, parents, devs, dev_com, branches,
                                       branches_com, files, com_files,
                                       methods, files_methods, com_methods)
            self.send_index_data(
                {'project_conf': self.project, 'data': data_})

        print('Driller finished in: \t', datetime.now() - start)
99 |
--------------------------------------------------------------------------------
/graphrepo/drillers/rabbit_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Default Parent class for drillers
15 | """
16 | import json
17 | import pika
18 |
19 | from abc import abstractmethod
20 | from datetime import datetime
21 | from py2neo import Graph
22 | from pydriller import RepositoryMining
23 |
24 | import graphrepo.utils as utl
25 | from graphrepo.config import Config
26 | from graphrepo.drillers.queue_driller import QueueDriller
27 | import graphrepo.drillers.batch_utils as b_utl
28 | from graphrepo.logger import Logger
29 |
30 | LG = Logger()
31 |
32 |
class RabbitDriller(QueueDriller):
    """RabbitDriller class - parses a git repo and publishes
    the data in a queue every n commits
    """

    def connect_queue(self):
        """Establishes a connection to RabbitMQ and declares a durable queue
        :returns: tuple of (connection, channel)
        """
        # FIX: removed the no-op 'except Exception as e: raise e' wrapper,
        # which only reset the traceback
        credentials = pika.PlainCredentials(
            self.queue['username'], self.queue['password'])
        self.con_parameters = pika.ConnectionParameters(self.queue['host'],
                                                        self.queue['port'],
                                                        self.queue['vhost'],
                                                        credentials)
        connection = pika.BlockingConnection(
            self.con_parameters)
        channel = connection.channel()

        channel.queue_declare(queue=self.queue['queue'], durable=True)
        return connection, channel

    def send_index_data(self, data):
        """Publishes one data batch as a persistent JSON message
        :param data: dict with the project config and the drilled data
        """
        connection, channel = self.connect_queue()
        try:
            channel.basic_publish(
                exchange='',
                routing_key=self.queue['queue'],
                body=json.dumps(data),
                properties=pika.BasicProperties(
                    delivery_mode=2,  # make message persistent
                ))
        finally:
            # FIX: close the connection even if publishing fails;
            # previously it leaked on error
            connection.close()
70 |
--------------------------------------------------------------------------------
/graphrepo/drillers/stomp_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Default Parent class for drillers
15 | """
16 | import stomp
17 | import json
18 |
19 | from abc import abstractmethod
20 | from datetime import datetime
21 | from py2neo import Graph
22 | from pydriller import RepositoryMining
23 |
24 | import graphrepo.utils as utl
25 | from graphrepo.config import Config
26 | from graphrepo.drillers.queue_driller import QueueDriller
27 | import graphrepo.drillers.batch_utils as b_utl
28 | from graphrepo.logger import Logger
29 |
30 | LG = Logger()
31 |
32 |
class StompDriller(QueueDriller):
    """StompDriller class - parses a git repo and publishes
    the data in a STOMP message queue every n commits.

    Relies on ``self.queue`` (set up by QueueDriller) being a dict
    with the keys: host, port, vhost, username, password, queue.
    """

    def connect_queue(self):
        """Establishes a connection to the STOMP broker.

        :returns: a connected stomp.Connection
        """
        conn = stomp.Connection(
            [(self.queue['host'], self.queue['port'])
             ], vhost=self.queue['vhost'], heartbeats=(10000, 10000)
        )

        conn.connect(self.queue['username'],
                     self.queue['password'], wait=True)
        return conn

    def send_index_data(self, data):
        """Publishes data to the queue as JSON.

        :param data: JSON-serializable payload
        """
        conn = self.connect_queue()
        try:
            # BUG FIX: self.queue is a dict (accessed by subscript
            # everywhere else, e.g. in connect_queue), so the previous
            # `self.queue.queue` raised AttributeError at runtime
            conn.send(body=json.dumps(data),
                      destination=self.queue['queue'])
        finally:
            # disconnect even when send raises, so the broker
            # connection is not leaked
            conn.disconnect()
60 |
--------------------------------------------------------------------------------
/graphrepo/logger.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Logger"""
15 | from graphrepo.singleton import Singleton
16 |
17 |
class Logger(metaclass=Singleton):
    """Singleton logger that writes exceptions to the console."""

    def __init__(self, *args, **kwargs):
        """Default init - no state is kept."""

    def log(self, exception):
        """Prints an exception to the console.

        :param exception: Exception type from Python
        """
        message = '[EXCEPTION]: {}'.format(exception)
        print(message)

    def log_and_raise(self, exception):
        """Prints the exception, then re-raises it.

        :param exception: Python Exception object
        """
        self.log(exception)
        raise exception
34 |
--------------------------------------------------------------------------------
/graphrepo/mappers/__init__.py:
--------------------------------------------------------------------------------
1 | from .csv import CSVMapper
2 | from .default import DefaultMapper
3 |
--------------------------------------------------------------------------------
/graphrepo/mappers/csv.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module is a custom miner class with some abstractions"""
15 | from abc import abstractmethod
16 | import pandas as pd
17 |
18 | from graphrepo.mappers.default import DefaultMapper
19 |
20 |
class CSVMapper(DefaultMapper):
    """Mapper that turns query results into a pandas DataFrame.

    The mappers are currently synchronous, but ideally they will
    be async in the future.
    """

    def map(self, objects):
        """Builds a DataFrame from an iterable of records.

        :param objects: iterable of dict-like records
        :returns: pandas.DataFrame built from the records
        """
        frame = pd.DataFrame(objects)
        return frame
31 |
--------------------------------------------------------------------------------
/graphrepo/mappers/default.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module is a custom mapper class with some abstractions"""
15 | from abc import abstractmethod
16 |
17 |
class DefaultMapper():
    """Base class for mappers that convert miner output to other
    data structures. The mappers are currently synchronous, but
    ideally they will be async in the future.
    """

    def __init__(self, *args, **kwargs):
        """Accepts and ignores any arguments; subclasses may override."""
24 |
--------------------------------------------------------------------------------
/graphrepo/miners/__init__.py:
--------------------------------------------------------------------------------
1 | from .commit import CommitMiner
2 | from .default import DefaultMiner
3 | from .developer import DeveloperMiner
4 | from .file import FileMiner
5 | from .mine_manager import MineManager
6 | from .method import MethodMiner
7 |
--------------------------------------------------------------------------------
/graphrepo/miners/commit.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module mines commits and contains all related Neo4j queries"""
15 |
16 | from graphrepo.miners.default import DefaultMiner
17 | from graphrepo.miners.utils import format_commit_id_date
18 |
19 |
class CommitMiner(DefaultMiner):
    """Holds all Neo4j queries related to Commit nodes."""

    def query(self, **kwargs):
        """Queries commits by any arguments given in kwargs,
        e.g. kwargs can be {'hash': 'example-hash'}.

        :param kwargs: any commit property and value
        :returns: list of matched commit nodes as dicts
        """
        matched = self.node_matcher.match("Commit", **kwargs)
        return list(map(dict, matched))

    def get_between_dates(self, start_date, end_date,
                          project_id=None):
        """Returns all commits between start and end date.

        :param start_date: timestamp, start date
        :param end_date: timestamp, end date
        :param project_id: optional; if given, only the commits from
            that project are returned
        :returns: list of commit dicts
        """
        com_filter, where = format_commit_id_date(
            project_id, start_date, end_date)
        cypher = """
            MATCH (c: Commit {0})
            {1}
            RETURN distinct c
            """.format(com_filter, where)
        cursor = self.graph.run(cypher)
        return [dict(record['c']) for record in cursor.data()]

    def get_all(self):
        """Returns all commits.

        :returns: list of commit dicts
        """
        return [dict(node) for node in self.node_matcher.match("Commit")]

    def get_commit_files(self, commit_hash):
        """Returns the files updated in a commit.

        :param commit_hash: the commit's unique identifier
        :returns: list of file nodes
        """
        cypher = """
            MATCH (c:Commit {{hash: "{0}"}})
            -[UpdateFile]->(f:File)
            return distinct f
            """.format(commit_hash)
        cursor = self.graph.run(cypher)
        return [record['f'] for record in cursor.data()]

    def get_commit_file_updates(self, commit_hash):
        """Returns the updates a commit made to files
        (UpdateFile relationships).

        :param commit_hash: the commit's unique identifier
        :returns: list of UpdateFile relationships
        """
        cypher = """
            MATCH (c:Commit {{hash: "{0}"}})
            -[f: UpdateFile]->(fu:File)
            return distinct f
            """.format(commit_hash)
        cursor = self.graph.run(cypher)
        return [record['f'] for record in cursor.data()]

    def get_commit_methods(self, commit_hash=None):
        """Returns the methods updated in a commit.

        :param commit_hash: the commit's unique identifier
        :returns: list of method nodes
        """
        cypher = """
            MATCH (c:Commit {{hash: "{0}"}})
            -[UpdateMethod]->(m:Method)
            return distinct m
            """.format(commit_hash)
        cursor = self.graph.run(cypher)
        return [record['m'] for record in cursor.data()]

    def get_commit_method_updates(self, commit_hash=None):
        """Returns the UpdateMethod relationships from a commit.

        :param commit_hash: the commit's unique identifier
        :returns: list of UpdateMethod relationships
        """
        cypher = """
            MATCH (c:Commit {{hash: "{0}"}})
            -[m:UpdateMethod]->(mu:Method)
            return distinct m
            """.format(commit_hash)
        cursor = self.graph.run(cypher)
        return [record['m'] for record in cursor.data()]
112 |
--------------------------------------------------------------------------------
/graphrepo/miners/default.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module is a custom miner class with some abstractions"""
15 | from abc import abstractmethod
16 |
17 |
class DefaultMiner():
    """Base class for all miners; keeps the shared graph handles.
    The miners are currently synchronous, but ideally they will
    be async in the future.
    """

    def __init__(self, graph, node_matcher, rel_matcher, *args, **kwargs):
        # py2neo handles shared by every concrete miner
        self.graph, self.node_matcher, self.rel_matcher = \
            graph, node_matcher, rel_matcher

    @abstractmethod
    def get_all(self):
        """Returns all artifacts found by a miner."""
        raise NotImplementedError
32 |
--------------------------------------------------------------------------------
/graphrepo/miners/developer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module mines developers and contains all related Neo4j queries"""
15 |
16 | from graphrepo.miners.default import DefaultMiner
17 | from graphrepo.miners.utils import format_commit_id_date as fcid
18 |
19 |
class DeveloperMiner(DefaultMiner):
    """This class holds queries for the Developer nodes.

    All date filters are timestamps compared against c.timestamp;
    query strings are built with `fcid` (format_commit_id_date).
    """

    def query(self, **kwargs):
        """Queries developers by any arguments given in kwargs
        For example kwargs can be {'hash': 'example-hash'} or
        {'email': 'example-email'}
        :param kwargs: any parameter and value, between hash, name or email
        :returns: py2neo node match object (iterable of matched nodes)
        """
        return self.node_matcher.match("Developer", **kwargs)

    def get_commits(self, dev_hash, project_id=None,
                    start_date=None, end_date=None):
        """Returns all commits authored by a developer.
        Optionally, it also filters by project id
        :param dev_hash: developer unique identifier
        :param project_id: optional; if present the
          query returns the commits from a project
        :param start_date: optional timestamp; filter commits
          beginning with this date
        :param end_date: optional timestamp; filter commits
          until this date
        :returns: list of commit dicts
        """
        com_filter, where = fcid(project_id,
                                 start_date, end_date)
        cquery = """
            MATCH (d:Developer {{hash: "{0}"}})
            -[r:Author]->
            (c:Commit {1})
            {2}
            RETURN distinct c;
            """.format(dev_hash, com_filter, where)
        dt_ = self.graph.run(cquery)
        return [dict(x['c']) for x in dt_.data()]

    def get_files(self, dev_hash, project_id=None,
                  start_date=None, end_date=None):
        """Returns all files edited by a developer.
        Optionally it also filters by project_id
        :param dev_hash: developer unique identifier
        :param project_id: optional; if present the query
          returns the files from a specific project
        :param start_date: optional timestamp; filter files
          beginning with this date
        :param end_date: optional timestamp; filter files
          until this date
        :returns: list of file dicts
        """
        com_filter, where = fcid(project_id,
                                 start_date, end_date)
        fquery = """
            MATCH (d:Developer {{hash: "{0}"}})
            -[r:Author]->
            (c:Commit {1})
            -[UpdateFile]->
            (f: File)
            {2}
            RETURN collect(distinct f);
            """.format(dev_hash, com_filter, where)
        dt_ = self.graph.run(fquery)
        # the query returns one row holding the collected file list,
        # hence the data()[0] indexing by the collect(...) column name
        return [dict(x) for x in dt_.data()[0]['collect(distinct f)']]

    def get_files_updates(self, dev_hash, project_id=None,
                          start_date=None, end_date=None):
        """Returns all file update information (e.g. file complexity),
        for all files edited by a developer.
        Optionally it also filters by project_id
        :param dev_hash: developer unique identifier
        :param project_id: optional; if present the query
          returns the files from a specific project
        :param start_date: optional timestamp; filter files
          beginning with this date
        :param end_date: optional timestamp; filter files
          until this date
        :returns: list of file update (UpdateFile relationship) dicts
        """
        com_filter, where = fcid(project_id,
                                 start_date, end_date)
        fuquery = """
            MATCH (d:Developer {{hash: "{0}"}})
            -[r:Author]->
            (c:Commit {1})
            -[fu: UpdateFile]->
            (f: File)
            {2}
            RETURN distinct fu;
            """.format(dev_hash, com_filter, where)

        dt_ = self.graph.run(fuquery)
        return [dict(x['fu']) for x in dt_.data()]

    def get_methods(self, dev_hash, project_id=None,
                    start_date=None, end_date=None):
        """Returns all methods updated by a developer.
        Optionally it also filters by project_id
        :param dev_hash: developer unique identifier
        :param project_id: optional; if present the query
          returns the methods from a specific project
        :param start_date: optional timestamp; filter methods
          beginning with this date
        :param end_date: optional timestamp; filter methods
          until this date
        :returns: list of method dicts
        """
        com_filter, where = fcid(project_id,
                                 start_date, end_date)
        mquery = """
            MATCH (d:Developer {{hash: "{0}"}})
            -[r:Author]->
            (c:Commit {1})
            -[um: UpdateMethod]->
            (m: Method)
            {2}
            RETURN distinct m;
            """.format(dev_hash, com_filter, where)

        dt_ = self.graph.run(mquery)
        return [dict(x['m']) for x in dt_.data()]

    def get_method_updates(self, dev_hash, project_id=None,
                           start_date=None, end_date=None):
        """Returns all method update information, for all
        methods updated by a developer.
        Optionally it also filters by project_id
        :param dev_hash: developer unique identifier
        :param project_id: optional; if present the query
          returns the method updates from a specific project
        :param start_date: optional timestamp; filter updates
          beginning with this date
        :param end_date: optional timestamp; filter updates
          until this date
        :returns: list of method update (UpdateMethod relationship) dicts
        """
        com_filter, where = fcid(project_id,
                                 start_date, end_date)
        muquery = """
            MATCH (d:Developer {{hash: "{0}"}})
            -[r:Author]->
            (c:Commit {1})
            -[um: UpdateMethod]->
            ()
            {2}
            RETURN distinct um;
            """.format(dev_hash, com_filter, where)

        dt_ = self.graph.run(muquery)
        return [dict(x['um']) for x in dt_.data()]

    def get_all(self):
        """Returns all Developer nodes
        :returns: py2neo node match object
        """
        return self.node_matcher.match("Developer")
172 |
--------------------------------------------------------------------------------
/graphrepo/miners/file.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module mines files and contains all related Neo4j queries"""
15 |
16 | from graphrepo.miners.default import DefaultMiner
17 |
18 |
class FileMiner(DefaultMiner):
    """Holds all Neo4j queries related to File nodes."""

    def query(self, **kwargs):
        """Searches for a file using the arguments in kwargs.
        If no kwargs are given it returns the first file found.
        """
        matched = self.node_matcher.match("File", **kwargs)
        return matched.first()

    def get_all(self):
        """Returns all nodes of type File.

        :return: list of files
        """
        return self.node_matcher.match("File")

    def get_change_history(self, file_hash):
        """Returns all UpdateFile relationships pointing at a file.

        :param file_hash: a string, unique identifier for the file
        :return: list of update file relationships as dicts
        """
        cypher = """MATCH ()-[r:UpdateFile]->(f:File {{hash: "{0}"}})
                return distinct r
                """.format(file_hash)
        cursor = self.graph.run(cypher)
        return [dict(record['r']) for record in cursor.data()]

    def get_current_methods(self, file_hash):
        """Returns all methods currently linked to a file.

        :param file_hash: a string, unique identifier for the file
        :return: list of methods as dicts
        """
        cypher = """MATCH (f:File {{hash: "{0}"}})-[r:Method]->(m:Method)
                return distinct m
                """.format(file_hash)
        cursor = self.graph.run(cypher)
        return [dict(record['m']) for record in cursor.data()]

    def get_past_methods(self, file):
        """Returns methods that were removed from the file.

        NOTE(review): not implemented yet - the body was commented
        out in the original and the method currently returns None.
        :param file: Py2Neo File object
        """
69 |
--------------------------------------------------------------------------------
/graphrepo/miners/method.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module mines files and contains all related Neo4j queries"""
15 |
16 | from graphrepo.miners.default import DefaultMiner
17 |
18 |
class MethodMiner(DefaultMiner):
    """Holds all Neo4j queries related to Method nodes."""

    # NOTE: the redundant __init__ override (a pure pass-through to
    # DefaultMiner.__init__) was removed; behavior is unchanged.

    def query(self, **kwargs):
        """Searches for a method using the arguments in kwargs.
        If no kwargs are given it returns the first method found.
        """
        return self.node_matcher.match("Method", **kwargs).first()

    def get_all(self):
        """Returns all nodes of type Method.

        :return: list of methods
        """
        return self.node_matcher.match("Method")

    def get_change_history(self, method_hash):
        """Returns all UpdateMethod relationships pointing at a method.

        :param method_hash: method unique identifier
        :return: list of UpdateMethod relationships as dicts
        """
        query = """MATCH ()-[r:UpdateMethod]->(m: Method{{hash: "{0}"}})
                RETURN distinct r
                """.format(method_hash)
        dt_ = self.graph.run(query)
        return [dict(x['r']) for x in dt_.data()]
48 |
--------------------------------------------------------------------------------
/graphrepo/miners/mine_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """This module initializes and configures all miners"""
15 | from py2neo import Graph, NodeMatcher, RelationshipMatcher
16 | import graphrepo.utils as utl
17 | from graphrepo.config import Config
18 | from graphrepo.logger import Logger
19 | from graphrepo.singleton import Singleton
20 | from graphrepo import miners
21 |
22 |
23 | LG = Logger()
24 |
25 |
class MineManager(metaclass=Singleton):
    """MineManager class - This class manages custom
    miners. At the moment we instantiate all miners,
    but other managers which handle different 'teams of miners'
    can be created.
    """

    def __init__(self, config_path):
        """Initializes the properties of this class
        :param config_path: path to a yaml config file with the
            neo4j and project settings
        """
        self.commit_miner, self.dev_miner, \
            self.file_miner, self.method_miner = None, None, None, None
        try:
            if not config_path:
                raise FileNotFoundError
            neo, project = utl.parse_config(config_path)
            self.config = Config()
            self.config.configure(**neo, **project)
            self.graph = None
            self.node_matcher = None
            self.rel_matcher = None
            self.connect()
        except Exception as exc:
            LG.log_and_raise(exc)

    def connect(self):
        """Instantiates the connection to Neo4j and stores
        the graph internally.
        Throws exception if the connection cannot be realized
        """
        try:
            self.graph = Graph(host=self.config.ct.db_url,
                               user=self.config.ct.db_user,
                               password=self.config.ct.db_pwd,
                               http_port=self.config.ct.port)
            self.node_matcher = NodeMatcher(self.graph)
            self.rel_matcher = RelationshipMatcher(self.graph)
            self.init_miners()
        except Exception as exc:
            LG.log_and_raise(exc)

    def check_connection(self):
        """Checks if there is a db connection and raises
        ReferenceError if not.
        """
        try:
            self.connect()
        except Exception as exc:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; the original cause is
            # now chained for easier debugging
            raise ReferenceError("There is no valid "
                                 "database connection. Please "
                                 "configure and connect first.") from exc

    def init_miners(self):
        """Initializes all miners"""
        try:
            # TODO: Parse this automatically?
            self.commit_miner = miners.CommitMiner(
                graph=self.graph,
                node_matcher=self.node_matcher,
                rel_matcher=self.rel_matcher)
            self.dev_miner = \
                miners.DeveloperMiner(graph=self.graph,
                                      node_matcher=self.node_matcher,
                                      rel_matcher=self.rel_matcher)
            self.file_miner = \
                miners.FileMiner(graph=self.graph,
                                 node_matcher=self.node_matcher,
                                 rel_matcher=self.rel_matcher)
            self.method_miner = \
                miners.MethodMiner(graph=self.graph,
                                   node_matcher=self.node_matcher,
                                   rel_matcher=self.rel_matcher)
        except Exception as exc:
            LG.log_and_raise(exc)

    def get_all_data(self):
        """Returns all nodes and relationships from Neo4j
        :returns: a tuple with two arrays: the first with nodes,
          the second with relationships
        """
        nodes = self.node_matcher.match()
        rels = self.rel_matcher.match()

        return list(nodes), list(rels)
112 |
--------------------------------------------------------------------------------
/graphrepo/miners/utils.py:
--------------------------------------------------------------------------------
1 | """Utils methods for miners"""
2 |
3 |
def format_commit_id_date(project_id, start_date, end_date, commit_hash=None):
    """Formats a Cypher commit node filter and WHERE clause from
    a project id, date range and an optional commit hash.

    :param project_id: the project unique identifier
    :param start_date: timestamp, commit start_date
    :param end_date: timestamp, commit end_date
    :param commit_hash: optional, if given the query
      filters by commit hash
    :returns: tuple of (node property filter string, WHERE clause)
    """
    com_filter, where = "", ""
    if project_id and not commit_hash:
        com_filter += """{{project_id: "{0}"}}""".format(project_id)
    if project_id and commit_hash:
        # BUG FIX: the hash value must be quoted just like project_id;
        # unquoted it produced invalid Cypher (hash: abc123)
        com_filter += """{{project_id: "{0}", hash: "{1}"}}""".format(
            project_id, commit_hash)
    if start_date:
        where += "c.timestamp >= {0}".format(start_date)
    if end_date:
        where += " AND " if where else ""
        where += "c.timestamp <= {0}".format(end_date)
    where = "WHERE " + where if where else where

    return com_filter, where
27 |
--------------------------------------------------------------------------------
/graphrepo/singleton.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Singleton metaclass"""
16 |
17 |
class Singleton(type):
    """
    Metaclass giving each class a single shared instance: the first
    call constructs it, every later call returns the same object.
    """

    def __init__(cls, name, bases, attrs, *args, **kwargs):
        super().__init__(name, bases, attrs)
        # one slot per class, filled lazily on the first call
        cls._instance = None

    def __call__(cls, *args, **kwargs):
        if cls._instance is not None:
            return cls._instance
        cls._instance = super().__call__(*args, **kwargs)
        return cls._instance
32 |
--------------------------------------------------------------------------------
/graphrepo/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 GraphRepo
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utils methods for GraphRepo"""
15 | import json
16 | import hashlib
17 | from datetime import datetime
18 | import yaml
19 |
20 |
class Dotdict(dict):
    """dict subclass allowing dot.notation access to its keys.

    Missing attributes resolve to None (dict.get semantics)
    instead of raising AttributeError.
    """

    def __getattr__(self, name):
        # only called when normal attribute lookup fails;
        # dict.get semantics: missing keys yield None
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        # mirrors dict deletion: missing keys raise KeyError
        del self[name]
26 |
27 |
def _parse_date(value):
    """Parses a '%d %B, %Y %H:%M' date string into a datetime,
    or returns None for falsy input."""
    return datetime.strptime(value, '%d %B, %Y %H:%M') if value else None


def parse_config(path):
    """Reads a yaml config file and returns its neo4j and project
    sections, with the project start/end dates parsed to datetimes.

    :param path: path to the yaml config file
    :returns: tuple of (neo, project) dicts
    """
    with open(path, 'r') as ymlfile:
        conf = yaml.load(ymlfile, Loader=yaml.FullLoader)

    neo = conf['neo']
    project = conf['project']

    # the duplicated strptime expressions were factored into
    # _parse_date; behavior is unchanged
    project['start_date'] = _parse_date(project['start_date'])
    project['end_date'] = _parse_date(project['end_date'])

    return neo, project
43 |
44 |
def save_json(path, data):
    """Serializes data as JSON and writes it to the given path."""
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(data))
48 |
49 |
def load_json(path):
    """Reads and deserializes a JSON file."""
    with open(path) as json_file:
        content = json_file.read()
    return json.loads(content)
53 |
54 |
def get_file_hash(file, project_id=None, use_new_path=False):
    """Computes a sha224 digest identifying a file from its path
    and filename.

    :param file: modification object exposing old_path, new_path
        and filename attributes
    :param project_id: optional project identifier mixed into
        the hashed name
    :param use_new_path: for modifications/renames, hash the new
        path instead of the old one
    :returns: hex digest string
    """
    old, new = file.old_path, file.new_path
    if new and not old:
        # ADD: only the new path exists
        path = new
    elif old and not new:
        # DELETE: only the old path exists
        path = old
    elif old and new:
        # MODIFY or RENAME: pick the requested side
        path = new if use_new_path else old
    else:
        path = ''

    name = path + file.filename
    if project_id:
        name = project_id + name
    return hashlib.sha224(str(name).encode('utf-8')).hexdigest()
73 |
74 |
def get_method_type(method, m_before, m_current):
    """Classify a method change by name membership.

    :param method: object with a ``name`` attribute
    :param m_before: method names present before the commit
    :param m_current: method names present after the commit
    :returns: ``"DELETE"``, ``"MODIFY"`` or ``"ADD"``
    """
    existed = method.name in m_before
    exists = method.name in m_current
    if existed:
        return "MODIFY" if exists else "DELETE"
    return "ADD"
82 |
83 |
def get_method_hash(method, file, project_id=None):
    """Return a stable sha224 id for *method* inside *file*.

    Combines the file hash with the method name; a given *project_id*
    is prefixed again here (it already contributes to the file hash).
    """
    file_hash = get_file_hash(file, project_id)
    key = "{}_{}".format(file_hash, method.name)
    if project_id:
        key = project_id + key
    return hashlib.sha224(key.encode('utf-8')).hexdigest()
89 |
90 |
def get_author_hash(email):
    """Return the sha224 hex digest of a developer's email address."""
    digest = hashlib.sha224(email.encode('utf-8'))
    return digest.hexdigest()
93 |
94 |
def format_dev(dev, index_email=True):
    """Map a commit author to a Developer node dict.

    When *index_email* is False the ``email`` field is blanked, but the
    ``hash`` is still derived from the real email address.
    """
    email = dev.author.email
    return {
        'name': dev.author.name,
        'email': email if index_email else '',
        'hash': get_author_hash(email),
    }
101 |
102 |
def get_commit_hash(chash, project_id):
    """Return the sha224 of the project-prefixed commit hash."""
    combined = project_id + chash
    return hashlib.sha224(str(combined).encode('utf-8')).hexdigest()
105 |
106 |
def format_commit(com, project_id):
    """Map a pydriller Commit to a flat dict of Commit node attributes.

    :param com: pydriller commit object
    :param project_id: project namespace for the node hash
    :returns: dict of node attributes; DMM metrics default to -1 when
        unavailable
    """
    return {
        'hash': get_commit_hash(com.hash, project_id),
        'commit_hash': com.hash,
        'message': com.msg,
        'is_merge': 1 if com.merge else 0,
        'timestamp': com.author_date.timestamp(),
        'project_id': project_id,
        # `is not None` keeps a legitimate metric value of 0.0 instead of
        # collapsing it to the -1 "unavailable" marker (truthiness bug).
        'dmm_unit_complexity': com.dmm_unit_complexity if com.dmm_unit_complexity is not None else -1,
        'dmm_unit_interfacing': com.dmm_unit_interfacing if com.dmm_unit_interfacing is not None else -1,
        'dmm_unit_size': com.dmm_unit_size if com.dmm_unit_size is not None else -1,
    }
119 |
120 |
def format_parent_commit(c_hash, parent_hash, project_id=None):
    """Map a child/parent commit pair to a Parent-relationship dict."""
    parent_id = get_commit_hash(parent_hash, project_id)
    return {'child_hash': c_hash, 'parent_hash': parent_id}
126 |
127 |
def format_branch(name, project_id):
    """Map a branch to a Branch node dict with a project-scoped hash."""
    bhash = hashlib.sha224(str(project_id + name).encode('utf-8')).hexdigest()
    return {'hash': bhash, 'project_id': project_id, 'name': name}
134 |
135 |
def format_author_commit(dev, com, timestamp):
    """Map a developer/commit pair to an Author-relationship dict.

    :param dev: dict with a ``hash`` key (see ``format_dev``)
    :param com: dict with a ``hash`` key (see ``format_commit``)
    :param timestamp: commit author timestamp
    """
    return {
        'commit_hash': com['hash'],
        'author_hash': dev['hash'],
        'timestamp': timestamp,
    }
141 |
142 |
def format_branch_commit(bhash, chash):
    """Map a branch/commit hash pair to a BranchCommit-relationship dict."""
    return {'branch_hash': bhash, 'commit_hash': chash}
147 |
148 |
def format_file(file, project_id):
    """Map a changed file to a File node dict.

    ``type`` is the last dot-separated segment of the filename; a
    filename without a dot yields ``'.<name>'`` (kept for
    compatibility with existing indexed data).
    """
    extension = '.' + file.filename.split('.')[-1]
    return {
        'hash': get_file_hash(file, project_id),
        'merge_hash': get_file_hash(file, project_id, use_new_path=True),
        'name': file.filename,
        'project_id': project_id,
        'type': extension,
    }
157 |
158 |
def format_commit_file(c_hash, file, timestamp, project_id, index_code=True):
    """Map a commit/file pair to an UpdateFile-relationship dict.

    :param c_hash: hashed id of the commit node
    :param file: pydriller modified-file object
    :param timestamp: commit author timestamp
    :param project_id: project namespace for the file hashes
    :param index_code: when True, store source code before/after the change
    :returns: dict with ``commit_hash``, ``file_hash`` and an
        ``attributes`` dict of relationship properties
    """
    f_hash = get_file_hash(file, project_id)
    f_merge_hash = get_file_hash(file, project_id, use_new_path=True)
    dt_ = {'commit_hash': c_hash, 'file_hash': f_hash,
           'attributes': {
               'timestamp': timestamp,
               'old_path': file.old_path if file.old_path else '',
               'path': file.new_path if file.new_path else '',
               'source_code': '',
               'source_code_before': '',
               'diff': file.diff,
               # `is not None` keeps a legitimate value of 0 instead of
               # collapsing it to the -1 "unavailable" marker.
               'nloc': file.nloc if file.nloc is not None else -1,
               'complexity': file.complexity if file.complexity is not None else -1,
               'token_count': file.token_count if file.token_count is not None else -1,
               'added': file.added,
               'removed': file.removed,
               'type': file.change_type.name,
               'f_hash': f_hash,
               'm_hash': f_merge_hash}}

    if index_code:
        # BUG FIX: the original assignments ended with a stray trailing
        # comma, which stored a 1-element tuple instead of a string.
        dt_['attributes']['source_code'] = str(
            file.source_code) if file.source_code else ''
        dt_['attributes']['source_code_before'] = str(
            file.source_code_before) if file.source_code_before else ''

    return dt_
186 |
187 |
def format_commit_method(c_hash, m_hash, met, timestamp):
    """Map a commit/method pair to an UpdateMethod-relationship dict.

    :param c_hash: hashed id of the commit node
    :param m_hash: hashed id of the method node
    :param met: method object carrying lizard-style metrics
    :param timestamp: commit author timestamp
    """
    attributes = {
        'timestamp': timestamp,
        'long_name': met.long_name,
        'parameters': met.parameters,
        'complexity': met.complexity,
        'nloc': met.nloc,
        'fan_in': met.fan_in,
        'fan_out': met.fan_out,
        'general_fan_out': met.general_fan_out,
        'length': met.length,
        'token_count': met.token_count,
        'start_line': met.start_line,
        'end_line': met.end_line,
    }
    return {'commit_hash': c_hash,
            'method_hash': m_hash,
            'attributes': attributes}
205 |
206 |
def format_method(met, fille, project_id):
    """Map a method to a Method node dict.

    ``fille`` (sic) is the file object the method belongs to; the
    misspelled parameter name is kept so keyword callers keep working.
    """
    return {
        'hash': get_method_hash(met, fille, project_id),
        'name': met.name,
        'file_name': met.filename,
        'project_id': project_id,
    }
213 |
214 |
def format_file_method(f_hash, m_hash):
    """Map a file/method hash pair to a Method-relationship dict."""
    return {'file_hash': f_hash, 'method_hash': m_hash}
217 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lizard==1.16.6
2 | pytz==2018.9
3 | psutil==5.7.0
4 | py2neo==4.3.0
5 | pydriller==1.15.1
6 | requests==2.21.0
7 | pytest==5.3.5
8 | GitPython==3.1.0
9 | PyYAML==5.3.1
10 | diskcache==4.1.0
11 | pika==1.1.0
12 | stomp.py==6.1.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # v0.3.5 released
2 | from setuptools import setup, find_packages
3 |
# Pin install_requires to the exact versions listed in requirements.txt.
with open('requirements.txt') as reqs_file:
    requirements = reqs_file.read().splitlines()

# Package metadata; version here must stay in sync with releases
# (see the version comment at the top of this file).
setup(name="graphrepo",
      version="0.3.5",
      description="A tool that maps a Github repo to Neo4j and Helps Mining the Repo in the DB",
      url="https://github.com/NullConvergence/GraphRepo",
      license='Apache License',
      python_requires='>=3.5',
      install_requires=requirements,
      packages=find_packages('.'),
      package_dir={'graphrepo': 'graphrepo'})
16 |
17 | # python3 setup.py sdist bdist_wheel
18 | # python3 -m twine upload dist/*
19 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/__init__.py
--------------------------------------------------------------------------------
/tests/cnfg_init.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 200
7 |
8 | project:
9 | repo: tests/gr-test
10 | start_date: "14 May, 2020 00:00"
11 | end_date: "15 May, 2020 23:00"
12 | project_id: 'graph_repo_test'
13 | index_code: False
14 | index_developer_email: True
--------------------------------------------------------------------------------
/tests/cnfg_simple.yml:
--------------------------------------------------------------------------------
1 | neo:
2 | db_url: localhost
3 | port: 7687
4 | db_user: neo4j
5 | db_pwd: neo4jj
6 | batch_size: 200
7 |
8 | project:
9 | repo: tests/gr-test
10 | start_date: "14 May, 2020 00:00"
11 | end_date: "15 May, 2020 02:00"
12 | project_id: 'graph_repo_test'
13 | index_code: True
14 | index_developer_email: True
--------------------------------------------------------------------------------
/tests/test_cache_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from graphrepo.drillers.cache_driller import CacheDriller
18 |
19 |
class TestCacheDriller:
    """Exercises both cache-driller indexing strategies against Neo4j."""

    def _node_count(self, driller):
        # Number of nodes currently present in the graph.
        return len(list(driller.graph.run("MATCH(n) RETURN n")))

    def test_indexing(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml'))
        driller.drill_batch_cache_sequential()
        assert self._node_count(driller) == 22

        driller.clean()

    def test_drill_batch_cache(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        driller = CacheDriller(os.path.join(folder, 'cnfg_init.yml'))
        driller.drill_batch_cache_all()
        assert self._node_count(driller) == 22

        driller.clean()
40 |
--------------------------------------------------------------------------------
/tests/test_commit.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import pytest
17 | import yaml
18 |
19 | from py2neo import NodeMatcher, RelationshipMatcher
20 | from graphrepo.drillers.driller import Driller
21 | from graphrepo.drillers.cache_driller import CacheDriller
22 |
23 |
class TestCommit:
    """Most data is indexed when indexing a commit,
    so this class checks indexing for several node/relationship types."""

    def test_nodes_index(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        # expected node counts per label for the test repository
        node_matcher = NodeMatcher(test_driller.graph)
        expected = {
            "Commit": 8,
            "Developer": 2,
            "File": 6,
            "Method": 5,
            "Branch": 1,
        }
        for label, count in expected.items():
            assert len(list(node_matcher.match(label))) == count

        test_driller.clean()

    def _assert_rel_counts(self, graph):
        # Shared relationship expectations for both driller variants.
        rel_matcher = RelationshipMatcher(graph)
        expected = {
            "BranchCommit": 8,
            "Author": 8,
            "Parent": 8,
            "UpdateFile": 9,
            "Method": 5,
            "UpdateMethod": 9,
        }
        for rel_type, count in expected.items():
            assert len(list(rel_matcher.match(None, rel_type))) == count

    def test_rel_index(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        self._assert_rel_counts(test_driller.graph)

        test_driller.clean()

    def test_rel_index_cache(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = CacheDriller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch_cache_sequential()

        self._assert_rel_counts(test_driller.graph)

        test_driller.clean()

    def test_custom_attributes_rel(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        node_matcher = NodeMatcher(test_driller.graph)
        rel_matcher = RelationshipMatcher(test_driller.graph)

        commit = node_matcher.match(
            "Commit", hash="aa6fa504ccb0fa919acc3cb31e510dc2048314eb0656f34babada15c").first()
        assert commit['is_merge'] == 0

        update_file_rel = rel_matcher.match([commit], "UpdateFile").first()
        expected_file = {
            'complexity': 2,
            'nloc': 8,
            'old_path': 'gr_test/default_class.py',
            'path': 'gr_test/default_class.py',
            'token_count': 42,
            'type': 'MODIFY',
            'removed': 6,
            'added': 0,
        }
        for attr, value in expected_file.items():
            assert update_file_rel[attr] == value

        update_method_rel = rel_matcher.match(
            [commit], 'UpdateMethod').first()
        # assert update_method_rel['type'] == 'DELETE'
        expected_method = {
            'nloc': 5,
            'complexity': 2,
            'token_count': 21,
            'length': 5,
            'fan_in': 0,
            'fan_out': 0,
            'start_line': 11,
            'end_line': 15,
        }
        for attr, value in expected_method.items():
            assert update_method_rel[attr] == value

        test_driller.clean()
143 |
--------------------------------------------------------------------------------
/tests/test_commit_miner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from datetime import datetime
15 | import os
16 |
17 | from py2neo import NodeMatcher, RelationshipMatcher
18 | from graphrepo.drillers.driller import Driller
19 | from graphrepo.miners.commit import CommitMiner
20 |
21 |
class TestCommitMiner:
    def test_gets(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        fmt = '%d %B, %Y %H:%M'
        st_date = datetime.strptime('14 May, 2020 00:00', fmt).timestamp()
        end_date = datetime.strptime('15 May, 2020 02:00', fmt).timestamp()

        graph = test_driller.graph
        com_miner = CommitMiner(graph, NodeMatcher(graph),
                                RelationshipMatcher(graph))

        assert len(com_miner.get_all()) == 8
        assert len(com_miner.get_between_dates(st_date, end_date)) == 8

        # all per-commit queries use the same known commit hash
        c_hash = 'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d'
        assert len(com_miner.get_commit_files(c_hash)) == 3
        assert len(com_miner.get_commit_file_updates(c_hash)) == 3
        assert len(com_miner.get_commit_methods(c_hash)) == 3
        assert len(com_miner.get_commit_method_updates(c_hash)) == 3

        test_driller.clean()
61 |
--------------------------------------------------------------------------------
/tests/test_csv_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from py2neo import NodeMatcher, RelationshipMatcher
18 | from graphrepo.drillers import Driller
19 | from graphrepo.mappers import CSVMapper
20 | from graphrepo.miners import CommitMiner
21 |
22 |
class TestCSVMapper:
    """Checks that miner query results map onto tabular CSV frames."""

    def test_csv_mapper(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        graph = test_driller.graph
        com_miner = CommitMiner(graph, NodeMatcher(graph),
                                RelationshipMatcher(graph))
        mapper = CSVMapper()

        assert mapper.map(com_miner.get_all()).shape == (8, 9)

        commit_files = com_miner.get_commit_files(
            'ad98f8594c15b1ebc4be4f20d849bcc0edf69ec574c33dfd84b7792d')
        assert mapper.map(commit_files).shape == (3, 5)

        test_driller.clean()
48 |
--------------------------------------------------------------------------------
/tests/test_db_init.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from graphrepo.drillers.default import DefaultDriller
4 | import graphrepo.drillers.db_init as db_init
5 |
6 | from py2neo.database import Schema
7 |
8 |
class TestDBInit:
    # node labels that receive hash constraints/indices
    LABELS = ["Developer", "Branch", "Commit", "File", "Method"]

    def test_hash_constraints(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml'))

        db_init.create_hash_constraints(test_driller.graph)

        schm = Schema(test_driller.graph)

        for label in self.LABELS:
            assert len(schm.get_uniqueness_constraints(label)) == 1

        # clean up so later tests start from a pristine schema
        for label in self.LABELS:
            schm.drop_uniqueness_constraint(label, 'hash')

    def test_indices(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = DefaultDriller(os.path.join(folder, 'cnfg_simple.yml'))

        db_init.create_indices(test_driller.graph, hash_index=True)

        schm = Schema(test_driller.graph)

        expected = {"Developer": 1, "Branch": 2, "Commit": 2,
                    "File": 3, "Method": 3}
        for label, count in expected.items():
            assert len(schm.get_indexes(label)) == count

        # clean: Developer only has a hash index, the others also project_id
        schm.drop_index("Developer", "hash")
        for label in ("Branch", "Commit", "File", "Method"):
            schm.drop_index(label, "hash")
            schm.drop_index(label, "project_id")
61 |
--------------------------------------------------------------------------------
/tests/test_dev_miner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from datetime import datetime
16 | import os
17 |
18 | from py2neo import NodeMatcher, RelationshipMatcher
19 | from graphrepo.drillers.driller import Driller
20 | from graphrepo.miners.developer import DeveloperMiner
21 |
22 |
class TestDevMiner:
    def test_gets(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        fmt = '%d %B, %Y %H:%M'
        st_date = datetime.strptime("14 May, 2020 00:00", fmt).timestamp()
        end_date = datetime.strptime("15 May, 2020 02:00", fmt).timestamp()

        graph = test_driller.graph
        dev_miner = DeveloperMiner(graph, NodeMatcher(graph),
                                   RelationshipMatcher(graph))

        # every query targets the same known developer
        dev_hash = "bb1a1830d2f4f4d13151827aa1072ed43bd8738a139da332e1ee3ddb"
        pid = test_driller.config.ct.project_id
        scoped = {'project_id': pid,
                  'start_date': st_date,
                  'end_date': end_date}

        assert len(dev_miner.get_all()) == 2

        assert len(dev_miner.get_commits(dev_hash=dev_hash)) == 7
        assert len(dev_miner.get_commits(dev_hash=dev_hash,
                                         project_id=pid)) == 7
        assert len(dev_miner.get_commits(dev_hash=dev_hash, **scoped)) == 7

        assert len(dev_miner.get_files(dev_hash=dev_hash)) == 6
        assert len(dev_miner.get_files(dev_hash=dev_hash, **scoped)) == 6

        assert len(dev_miner.get_files_updates(dev_hash=dev_hash)) == 9
        assert len(dev_miner.get_files_updates(dev_hash=dev_hash,
                                               **scoped)) == 9

        assert len(dev_miner.get_methods(dev_hash=dev_hash)) == 5
        assert len(dev_miner.get_methods(dev_hash=dev_hash, **scoped)) == 5

        assert len(dev_miner.get_method_updates(dev_hash=dev_hash)) == 9
        assert len(dev_miner.get_method_updates(dev_hash=dev_hash,
                                                **scoped)) == 9

        test_driller.clean()
113 |
--------------------------------------------------------------------------------
/tests/test_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from graphrepo.drillers.driller import Driller
18 |
19 |
class TestDriller:
    def _node_count(self, driller):
        # Number of nodes currently present in the graph.
        return len(list(driller.graph.run("MATCH(n) RETURN n")))

    def test_configure(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_init.yml'))

        assert test_driller.config.ct.db_url == 'localhost'
        assert test_driller.config.ct.repo == 'tests/gr-test'

        assert test_driller.graph is not None

    def test_indexing(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_init.yml'))
        test_driller.drill_batch()
        assert self._node_count(test_driller) == 22

        test_driller.clean()

    def test_index_save(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_init.yml'))
        save_path = 'data/graphrepo.json'

        test_driller.drill_batch(save_path=save_path)
        assert self._node_count(test_driller) == 22

        test_driller.clean()

        # re-index from the saved JSON dump and expect the same graph
        test_driller.index_from_file(file_path=save_path)
        assert self._node_count(test_driller) == 22

        os.remove(save_path)
        test_driller.clean()
57 |
--------------------------------------------------------------------------------
/tests/test_file.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xserban/GraphRepo/ce625cd4efddc96f28835c541865ef11b46e4002/tests/test_file.py
--------------------------------------------------------------------------------
/tests/test_file_miner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from py2neo import NodeMatcher, RelationshipMatcher
18 | from graphrepo.drillers.driller import Driller
19 | from graphrepo.miners.file import FileMiner
20 |
21 |
class TestFileMiner:
    def test_get_all(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        graph = test_driller.graph
        f_miner = FileMiner(graph, NodeMatcher(graph),
                            RelationshipMatcher(graph))

        assert len(f_miner.get_all()) == 6

        # the README node can be queried by name
        readme = f_miner.query(name='README.MD')
        assert readme['name'] == 'README.MD'

        # change history and current methods for a known file hash
        f_hash = 'f85f4af5b20ddd617f93da13c7789a65fb972e68a8d634d5f253abab'
        assert len(f_miner.get_change_history(f_hash)) == 3
        assert len(f_miner.get_current_methods(f_hash)) == 2

        test_driller.clean()
50 |
--------------------------------------------------------------------------------
/tests/test_method_miner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from py2neo import NodeMatcher, RelationshipMatcher
18 | from graphrepo.drillers.driller import Driller
19 | from graphrepo.miners.method import MethodMiner
20 |
21 |
class TestMethodMiner:
    def test_get_all(self):
        folder = os.path.dirname(os.path.abspath(__file__))
        test_driller = Driller(os.path.join(folder, 'cnfg_simple.yml'))
        test_driller.drill_batch()

        graph = test_driller.graph
        m_miner = MethodMiner(graph, NodeMatcher(graph),
                              RelationshipMatcher(graph))

        assert len(m_miner.get_all()) == 5

        # query and change history for a known method hash
        m_hash = '45ce8dcd8b0cd8ed42e592ce828ab6418e7c79713b8dc99805bcb7ea'
        assert m_miner.query(hash=m_hash)['name'] == 'get_name'
        assert len(m_miner.get_change_history(m_hash)) == 2

        test_driller.clean()
43 |
--------------------------------------------------------------------------------
/tests/test_queue_driller.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | from graphrepo.drillers.queue_driller import QueueDriller
18 |
19 |
20 | # class TestQueueDriller:
21 | # def test_indexing(self):
22 | # folder = os.path.dirname(os.path.abspath(__file__))
23 | # test_driller = QueueDriller(os.path.join(folder, 'cnfg_init.yml'))
24 | # test_driller.drill_batch()
25 | # records = [r for r in test_driller.graph.run(
26 | # "MATCH(n) RETURN n")]
27 | # assert len(records) == 22
28 |
29 | # test_driller.clean()
30 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 NullConvergence
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from graphrepo.utils import parse_config
17 |
18 |
class TestUtils:
    def test_parse_config(self):
        """Check that parse_config splits the yml into neo4j and project sections."""
        here = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(here, 'cnfg_init.yml')
        neo_conf, project_conf = parse_config(config_path)

        # Connection settings come from the 'neo' section of the fixture.
        assert neo_conf['db_url'] == 'localhost'
        assert neo_conf['db_user'] == 'neo4j'
        # Repository location comes from the 'project' section.
        assert project_conf['repo'] == 'tests/gr-test'
26 |
--------------------------------------------------------------------------------