├── .flake8 ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── Dockerfile ├── LICENSE.md ├── MAINTAINERS ├── README.md ├── apollo ├── __init__.py ├── __main__.py ├── bags.py ├── cassandra_utils.py ├── graph.py ├── hasher.py ├── query.md.jinja2 ├── query.py ├── report.md.jinja2 └── warmup.py ├── doc ├── 101.md ├── GLOSSARY.md ├── README.md ├── SUMMARY.md ├── algorithm.md ├── cmd │ ├── bags.md │ ├── cc.md │ ├── cmd.md │ ├── db.md │ ├── dumpcc.md │ ├── dumpcmd.md │ ├── evalcc.md │ ├── features.md │ ├── hash.md │ ├── preprocess.md │ ├── query.md │ └── resetdb.md ├── gemini.md ├── install │ ├── README.md │ ├── db.md │ ├── docker.md │ └── pip.md └── model │ ├── cc.md │ ├── cmd.md │ └── wmh.md ├── docker-compose.yml ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── test_graph_CommunityDetector.py ├── test_graph_ConnectedComponents.py └── test_modify_feature_weights.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=99 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _book 2 | bundle 3 | *.asdf 4 | 5 | #Mac OS 6 | *.DS_Store 7 | 8 | #PyCharm IDE 9 | .idea/ 10 | 11 | # Documentation build files 12 | doc/_build/ 13 | doc/ast2vec.rst 14 | doc/modules.rst 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # dotenv 98 | .env 99 | 100 | # virtualenv 101 | .venv 102 | venv/ 103 | ENV/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | services: 4 | - docker 5 | cache: 6 | directories: 7 | - "$HOME/.cache/pip" 8 | addons: 9 | apt: 10 | packages: 11 | - libsnappy-dev 12 | _install: &_install 13 | - gimme 1.8 14 | - source ~/.gimme/envs/latest.env 15 | - pip install --upgrade pip 16 | - pip install codecov 17 | - pip install -e . 18 | _coverage: &_coverage 19 | - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine" 20 | matrix: 21 | include: 22 | - python: 3.4 23 | env: *_coverage 24 | install: *_install 25 | - python: 3.5 26 | env: *_coverage 27 | install: *_install 28 | - python: 3.6 29 | env: SCRIPT="flake8 ." 30 | install: pip install flake8 31 | - python: 3.6 32 | env: *_coverage 33 | install: *_install 34 | after_success: 35 | - codecov 36 | fast_finish: true 37 | before_script: 38 | - docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd 39 | - docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver 40 | - docker run --name scylla -p 9042:9042 -d scylladb/scylla --developer-mode=1 41 | script: 42 | - (eval "$SCRIPT") 43 | notifications: 44 | email: false 45 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | education, socio-economic status, nationality, personal appearance, race, 10 | religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at conduct@sourced.tech. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Apollo project is [GPL licensed](LICENSE.md) and accepts 4 | contributions via GitHub pull requests. This document outlines some of the 5 | conventions on development workflow, commit message formatting, contact points, 6 | and other resources to make it easier to get your contribution accepted. 
7 | 
8 | ## Certificate of Origin
9 | 
10 | By contributing to this project you agree to the [Developer Certificate of
11 | Origin (DCO)](DCO). This document was created by the Linux Kernel community and is a
12 | simple statement that you, as a contributor, have the legal right to make the
13 | contribution.
14 | 
15 | In order to show your agreement with the DCO, you should include the following line at the end of the
16 | commit message, using your real name: `Signed-off-by: John Doe <john.doe@example.com>`.
17 | 
18 | This can be done easily with the [`-s`](https://github.com/git/git/blob/b2c150d3aa82f6583b9aadfecc5f8fa1c74aca09/Documentation/git-commit.txt#L154-L161) flag of `git commit`.
19 | 
20 | 
21 | ## Support Channels
22 | 
23 | The official support channels, for both users and contributors, are:
24 | 
25 | - GitHub [issues](https://github.com/src-d/apollo/issues)*
26 | - Slack: #machine-learning room in the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM)
27 | 
28 | *Before opening a new issue or submitting a new pull request, it's helpful to
29 | search the project - it's likely that another user has already reported the
30 | issue you're facing, or it's a known issue that we're already aware of.
31 | 
32 | 
33 | ## How to Contribute
34 | 
35 | Pull Requests (PRs) are the main and exclusive way to contribute to the official Apollo project.
36 | In order for a PR to be accepted it needs to pass a list of requirements:
37 | 
38 | - Code Coverage does not decrease.
39 | - All the tests pass.
40 | - The code is formatted according to [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/).
41 | - If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged.
42 | - If the PR is a new feature, it has to come with a suite of unit tests that test the new functionality.
43 | - In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS).
44 | 
45 | 
46 | ### Format of the commit message
47 | 
48 | The commit summary must start with a capital letter and a verb in the present tense. No dot at the end.
49 | 
50 | ``` 
51 | Add a feature
52 | Remove unused code
53 | Fix a bug
54 | ``` 
55 | 
56 | Every commit's details should describe what was changed, in which context and, if applicable, the GitHub issue it relates to.
57 | 
--------------------------------------------------------------------------------
/DCO:
--------------------------------------------------------------------------------
1 | Developer Certificate of Origin
2 | Version 1.1
3 | 
4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
5 | 1 Letterman Drive
6 | Suite D4700
7 | San Francisco, CA, 94129
8 | 
9 | Everyone is permitted to copy and distribute verbatim copies of this
10 | license document, but changing it is not allowed.
11 | 12 | 13 | Developer's Certificate of Origin 1.1 14 | 15 | By making a contribution to this project, I certify that: 16 | 17 | (a) The contribution was created in whole or in part by me and I 18 | have the right to submit it under the open source license 19 | indicated in the file; or 20 | 21 | (b) The contribution is based upon previous work that, to the best 22 | of my knowledge, is covered under an appropriate open source 23 | license and I have the right under that license to submit that 24 | work with modifications, whether created in whole or in part 25 | by me, under the same open source license (unless I am 26 | permitted to submit under a different license), as indicated 27 | in the file; or 28 | 29 | (c) The contribution was provided directly to me by some other 30 | person who certified (a), (b) or (c) and I have not modified 31 | it. 32 | 33 | (d) I understand and agree that this project and the contribution 34 | are public and that a record of the contribution (including all 35 | personal information I submit with it, including my sign-off) is 36 | maintained indefinitely and may be redistributed consistent with 37 | this project or the open source license(s) involved. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # The underlying base is ubuntu:16.04 2 | FROM nvidia/cuda:8.0-runtime 3 | 4 | # NVIDIA driver version must match the host! 5 | ENV DRIVER_VERSION 384.69 6 | RUN mkdir -p /opt/nvidia && cd /opt/nvidia/ \ 7 | && apt-get update && apt-get install -y wget module-init-tools && apt-get clean && rm -rf /var/lib/apt/lists/* \ 8 | && wget http://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run -O /opt/nvidia/driver.run \ 9 | && chmod +x /opt/nvidia/driver.run \ 10 | && /opt/nvidia/driver.run -s --no-nvidia-modprobe --no-kernel-module --no-nouveau-check --no-distro-scripts --no-opengl-files --no-kernel-module-source \ 11 | && rm -rf /opt/nvidia && apt-get purge -y module-init-tools && apt-get autoremove -y 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y --no-install-suggests --no-install-recommends \ 15 | ca-certificates locales git python3 libpython3.5 python3-dev \ 16 | libgomp1 libxml2 libxml2-dev zlib1g-dev \ 17 | libsnappy1v5 libsnappy-dev libonig2 make gcc g++ curl openjdk-8-jre && \ 18 | curl https://bootstrap.pypa.io/get-pip.py | python3 && \ 19 | pip3 install --no-cache-dir PyStemmer bblfsh py4j==0.10.4 modelforge parquet jinja2 libMHCUDA datasketch cassandra_driver python-igraph numpy humanize pygments && \ 20 | apt-get remove -y python3-dev libxml2-dev libsnappy-dev zlib1g-dev make gcc g++ curl && \ 21 | apt-get remove -y *-doc *-man >/dev/null && \ 22 | apt-get autoremove -y && \ 23 | apt-get clean && \ 24 | rm -rf /var/lib/apt/lists/* && \ 25 | locale-gen en_US.UTF-8 26 | 27 | # sudo mount -o bind ... 
bundle/* 28 | ADD bundle/spark /spark/ 29 | ADD bundle/engine/python /bundle/sourced/engine/ 30 | ADD bundle/ml /bundle/sourced/ml/ 31 | 32 | ADD apollo/ /packages/apollo/apollo/ 33 | ADD setup.py /packages/apollo 34 | 35 | ENV PYTHONPATH /packages:/spark/python 36 | ENV LANG en_US.UTF-8 37 | WORKDIR /packages 38 | 39 | RUN echo '0.5.2' > /bundle/sourced/engine/version.txt && pip3 install -e /bundle/sourced/engine/ 40 | RUN pip3 install -e /bundle/sourced/ml/ 41 | RUN pip3 install --no-deps -e apollo/ && apollo warmup -s 'local[*]' 42 | 43 | ENTRYPOINT ["apollo"] 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ### GNU GENERAL PUBLIC LICENSE 2 | 3 | Version 3, 29 June 2007 4 | 5 | Copyright (C) 2007 Free Software Foundation, Inc. 6 | 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | ### Preamble 12 | 13 | The GNU General Public License is a free, copyleft license for 14 | software and other kinds of works. 15 | 16 | The licenses for most software and other practical works are designed 17 | to take away your freedom to share and change the works. By contrast, 18 | the GNU General Public License is intended to guarantee your freedom 19 | to share and change all versions of a program--to make sure it remains 20 | free software for all its users. We, the Free Software Foundation, use 21 | the GNU General Public License for most of our software; it applies 22 | also to any other work released this way by its authors. You can apply 23 | it to your programs, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | them if you wish), that you receive source code or can get it if you 29 | want it, that you can change the software or use pieces of it in new 30 | free programs, and that you know you can do these things. 31 | 32 | To protect your rights, we need to prevent others from denying you 33 | these rights or asking you to surrender the rights. Therefore, you 34 | have certain responsibilities if you distribute copies of the 35 | software, or if you modify it: responsibilities to respect the freedom 36 | of others. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must pass on to the recipients the same 40 | freedoms that you received. You must make sure that they, too, receive 41 | or can get the source code. And you must show them these terms so they 42 | know their rights. 43 | 44 | Developers that use the GNU GPL protect your rights with two steps: 45 | (1) assert copyright on the software, and (2) offer you this License 46 | giving you legal permission to copy, distribute and/or modify it. 47 | 48 | For the developers' and authors' protection, the GPL clearly explains 49 | that there is no warranty for this free software. For both users' and 50 | authors' sake, the GPL requires that modified versions be marked as 51 | changed, so that their problems will not be attributed erroneously to 52 | authors of previous versions. 53 | 54 | Some devices are designed to deny users access to install or run 55 | modified versions of the software inside them, although the 56 | manufacturer can do so. 
This is fundamentally incompatible with the 57 | aim of protecting users' freedom to change the software. The 58 | systematic pattern of such abuse occurs in the area of products for 59 | individuals to use, which is precisely where it is most unacceptable. 60 | Therefore, we have designed this version of the GPL to prohibit the 61 | practice for those products. If such problems arise substantially in 62 | other domains, we stand ready to extend this provision to those 63 | domains in future versions of the GPL, as needed to protect the 64 | freedom of users. 65 | 66 | Finally, every program is threatened constantly by software patents. 67 | States should not allow patents to restrict development and use of 68 | software on general-purpose computers, but in those that do, we wish 69 | to avoid the special danger that patents applied to a free program 70 | could make it effectively proprietary. To prevent this, the GPL 71 | assures that patents cannot be used to render the program non-free. 72 | 73 | The precise terms and conditions for copying, distribution and 74 | modification follow. 75 | 76 | ### TERMS AND CONDITIONS 77 | 78 | #### 0. Definitions. 79 | 80 | "This License" refers to version 3 of the GNU General Public License. 81 | 82 | "Copyright" also means copyright-like laws that apply to other kinds 83 | of works, such as semiconductor masks. 84 | 85 | "The Program" refers to any copyrightable work licensed under this 86 | License. Each licensee is addressed as "you". "Licensees" and 87 | "recipients" may be individuals or organizations. 88 | 89 | To "modify" a work means to copy from or adapt all or part of the work 90 | in a fashion requiring copyright permission, other than the making of 91 | an exact copy. The resulting work is called a "modified version" of 92 | the earlier work or a work "based on" the earlier work. 93 | 94 | A "covered work" means either the unmodified Program or a work based 95 | on the Program. 96 | 97 | To "propagate" a work means to do anything with it that, without 98 | permission, would make you directly or secondarily liable for 99 | infringement under applicable copyright law, except executing it on a 100 | computer or modifying a private copy. Propagation includes copying, 101 | distribution (with or without modification), making available to the 102 | public, and in some countries other activities as well. 103 | 104 | To "convey" a work means any kind of propagation that enables other 105 | parties to make or receive copies. Mere interaction with a user 106 | through a computer network, with no transfer of a copy, is not 107 | conveying. 108 | 109 | An interactive user interface displays "Appropriate Legal Notices" to 110 | the extent that it includes a convenient and prominently visible 111 | feature that (1) displays an appropriate copyright notice, and (2) 112 | tells the user that there is no warranty for the work (except to the 113 | extent that warranties are provided), that licensees may convey the 114 | work under this License, and how to view a copy of this License. If 115 | the interface presents a list of user commands or options, such as a 116 | menu, a prominent item in the list meets this criterion. 117 | 118 | #### 1. Source Code. 119 | 120 | The "source code" for a work means the preferred form of the work for 121 | making modifications to it. "Object code" means any non-source form of 122 | a work. 
123 | 124 | A "Standard Interface" means an interface that either is an official 125 | standard defined by a recognized standards body, or, in the case of 126 | interfaces specified for a particular programming language, one that 127 | is widely used among developers working in that language. 128 | 129 | The "System Libraries" of an executable work include anything, other 130 | than the work as a whole, that (a) is included in the normal form of 131 | packaging a Major Component, but which is not part of that Major 132 | Component, and (b) serves only to enable use of the work with that 133 | Major Component, or to implement a Standard Interface for which an 134 | implementation is available to the public in source code form. A 135 | "Major Component", in this context, means a major essential component 136 | (kernel, window system, and so on) of the specific operating system 137 | (if any) on which the executable work runs, or a compiler used to 138 | produce the work, or an object code interpreter used to run it. 139 | 140 | The "Corresponding Source" for a work in object code form means all 141 | the source code needed to generate, install, and (for an executable 142 | work) run the object code and to modify the work, including scripts to 143 | control those activities. However, it does not include the work's 144 | System Libraries, or general-purpose tools or generally available free 145 | programs which are used unmodified in performing those activities but 146 | which are not part of the work. For example, Corresponding Source 147 | includes interface definition files associated with source files for 148 | the work, and the source code for shared libraries and dynamically 149 | linked subprograms that the work is specifically designed to require, 150 | such as by intimate data communication or control flow between those 151 | subprograms and other parts of the work. 152 | 153 | The Corresponding Source need not include anything that users can 154 | regenerate automatically from other parts of the Corresponding Source. 155 | 156 | The Corresponding Source for a work in source code form is that same 157 | work. 158 | 159 | #### 2. Basic Permissions. 160 | 161 | All rights granted under this License are granted for the term of 162 | copyright on the Program, and are irrevocable provided the stated 163 | conditions are met. This License explicitly affirms your unlimited 164 | permission to run the unmodified Program. The output from running a 165 | covered work is covered by this License only if the output, given its 166 | content, constitutes a covered work. This License acknowledges your 167 | rights of fair use or other equivalent, as provided by copyright law. 168 | 169 | You may make, run and propagate covered works that you do not convey, 170 | without conditions so long as your license otherwise remains in force. 171 | You may convey covered works to others for the sole purpose of having 172 | them make modifications exclusively for you, or provide you with 173 | facilities for running those works, provided that you comply with the 174 | terms of this License in conveying all material for which you do not 175 | control copyright. Those thus making or running the covered works for 176 | you must do so exclusively on your behalf, under your direction and 177 | control, on terms that prohibit them from making any copies of your 178 | copyrighted material outside their relationship with you. 
179 | 180 | Conveying under any other circumstances is permitted solely under the 181 | conditions stated below. Sublicensing is not allowed; section 10 makes 182 | it unnecessary. 183 | 184 | #### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 185 | 186 | No covered work shall be deemed part of an effective technological 187 | measure under any applicable law fulfilling obligations under article 188 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 189 | similar laws prohibiting or restricting circumvention of such 190 | measures. 191 | 192 | When you convey a covered work, you waive any legal power to forbid 193 | circumvention of technological measures to the extent such 194 | circumvention is effected by exercising rights under this License with 195 | respect to the covered work, and you disclaim any intention to limit 196 | operation or modification of the work as a means of enforcing, against 197 | the work's users, your or third parties' legal rights to forbid 198 | circumvention of technological measures. 199 | 200 | #### 4. Conveying Verbatim Copies. 201 | 202 | You may convey verbatim copies of the Program's source code as you 203 | receive it, in any medium, provided that you conspicuously and 204 | appropriately publish on each copy an appropriate copyright notice; 205 | keep intact all notices stating that this License and any 206 | non-permissive terms added in accord with section 7 apply to the code; 207 | keep intact all notices of the absence of any warranty; and give all 208 | recipients a copy of this License along with the Program. 209 | 210 | You may charge any price or no price for each copy that you convey, 211 | and you may offer support or warranty protection for a fee. 212 | 213 | #### 5. Conveying Modified Source Versions. 214 | 215 | You may convey a work based on the Program, or the modifications to 216 | produce it from the Program, in the form of source code under the 217 | terms of section 4, provided that you also meet all of these 218 | conditions: 219 | 220 | - a) The work must carry prominent notices stating that you modified 221 | it, and giving a relevant date. 222 | - b) The work must carry prominent notices stating that it is 223 | released under this License and any conditions added under 224 | section 7. This requirement modifies the requirement in section 4 225 | to "keep intact all notices". 226 | - c) You must license the entire work, as a whole, under this 227 | License to anyone who comes into possession of a copy. This 228 | License will therefore apply, along with any applicable section 7 229 | additional terms, to the whole of the work, and all its parts, 230 | regardless of how they are packaged. This License gives no 231 | permission to license the work in any other way, but it does not 232 | invalidate such permission if you have separately received it. 233 | - d) If the work has interactive user interfaces, each must display 234 | Appropriate Legal Notices; however, if the Program has interactive 235 | interfaces that do not display Appropriate Legal Notices, your 236 | work need not make them do so. 
237 | 238 | A compilation of a covered work with other separate and independent 239 | works, which are not by their nature extensions of the covered work, 240 | and which are not combined with it such as to form a larger program, 241 | in or on a volume of a storage or distribution medium, is called an 242 | "aggregate" if the compilation and its resulting copyright are not 243 | used to limit the access or legal rights of the compilation's users 244 | beyond what the individual works permit. Inclusion of a covered work 245 | in an aggregate does not cause this License to apply to the other 246 | parts of the aggregate. 247 | 248 | #### 6. Conveying Non-Source Forms. 249 | 250 | You may convey a covered work in object code form under the terms of 251 | sections 4 and 5, provided that you also convey the machine-readable 252 | Corresponding Source under the terms of this License, in one of these 253 | ways: 254 | 255 | - a) Convey the object code in, or embodied in, a physical product 256 | (including a physical distribution medium), accompanied by the 257 | Corresponding Source fixed on a durable physical medium 258 | customarily used for software interchange. 259 | - b) Convey the object code in, or embodied in, a physical product 260 | (including a physical distribution medium), accompanied by a 261 | written offer, valid for at least three years and valid for as 262 | long as you offer spare parts or customer support for that product 263 | model, to give anyone who possesses the object code either (1) a 264 | copy of the Corresponding Source for all the software in the 265 | product that is covered by this License, on a durable physical 266 | medium customarily used for software interchange, for a price no 267 | more than your reasonable cost of physically performing this 268 | conveying of source, or (2) access to copy the Corresponding 269 | Source from a network server at no charge. 270 | - c) Convey individual copies of the object code with a copy of the 271 | written offer to provide the Corresponding Source. This 272 | alternative is allowed only occasionally and noncommercially, and 273 | only if you received the object code with such an offer, in accord 274 | with subsection 6b. 275 | - d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | - e) Convey the object code using peer-to-peer transmission, 288 | provided you inform other peers where the object code and 289 | Corresponding Source of the work are being offered to the general 290 | public at no charge under subsection 6d. 291 | 292 | A separable portion of the object code, whose source code is excluded 293 | from the Corresponding Source as a System Library, need not be 294 | included in conveying the object code work. 
295 | 296 | A "User Product" is either (1) a "consumer product", which means any 297 | tangible personal property which is normally used for personal, 298 | family, or household purposes, or (2) anything designed or sold for 299 | incorporation into a dwelling. In determining whether a product is a 300 | consumer product, doubtful cases shall be resolved in favor of 301 | coverage. For a particular product received by a particular user, 302 | "normally used" refers to a typical or common use of that class of 303 | product, regardless of the status of the particular user or of the way 304 | in which the particular user actually uses, or expects or is expected 305 | to use, the product. A product is a consumer product regardless of 306 | whether the product has substantial commercial, industrial or 307 | non-consumer uses, unless such uses represent the only significant 308 | mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to 312 | install and execute modified versions of a covered work in that User 313 | Product from a modified version of its Corresponding Source. The 314 | information must suffice to ensure that the continued functioning of 315 | the modified object code is in no case prevented or interfered with 316 | solely because modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or 331 | updates for a work that has been modified or installed by the 332 | recipient, or for the User Product in which it has been modified or 333 | installed. Access to a network may be denied when the modification 334 | itself materially and adversely affects the operation of the network 335 | or violates the rules and protocols for communication across the 336 | network. 337 | 338 | Corresponding Source conveyed, and Installation Information provided, 339 | in accord with this section must be in a format that is publicly 340 | documented (and with an implementation available to the public in 341 | source code form), and must require no special password or key for 342 | unpacking, reading or copying. 343 | 344 | #### 7. Additional Terms. 345 | 346 | "Additional permissions" are terms that supplement the terms of this 347 | License by making exceptions from one or more of its conditions. 348 | Additional permissions that are applicable to the entire Program shall 349 | be treated as though they were included in this License, to the extent 350 | that they are valid under applicable law. 
If additional permissions 351 | apply only to part of the Program, that part may be used separately 352 | under those permissions, but the entire Program remains governed by 353 | this License without regard to the additional permissions. 354 | 355 | When you convey a copy of a covered work, you may at your option 356 | remove any additional permissions from that copy, or from any part of 357 | it. (Additional permissions may be written to require their own 358 | removal in certain cases when you modify the work.) You may place 359 | additional permissions on material, added by you to a covered work, 360 | for which you have or can give appropriate copyright permission. 361 | 362 | Notwithstanding any other provision of this License, for material you 363 | add to a covered work, you may (if authorized by the copyright holders 364 | of that material) supplement the terms of this License with terms: 365 | 366 | - a) Disclaiming warranty or limiting liability differently from the 367 | terms of sections 15 and 16 of this License; or 368 | - b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | - c) Prohibiting misrepresentation of the origin of that material, 372 | or requiring that modified versions of such material be marked in 373 | reasonable ways as different from the original version; or 374 | - d) Limiting the use for publicity purposes of names of licensors 375 | or authors of the material; or 376 | - e) Declining to grant rights under trademark law for use of some 377 | trade names, trademarks, or service marks; or 378 | - f) Requiring indemnification of licensors and authors of that 379 | material by anyone who conveys the material (or modified versions 380 | of it) with contractual assumptions of liability to the recipient, 381 | for any liability that these contractual assumptions directly 382 | impose on those licensors and authors. 383 | 384 | All other non-permissive additional terms are considered "further 385 | restrictions" within the meaning of section 10. If the Program as you 386 | received it, or any part of it, contains a notice stating that it is 387 | governed by this License along with a term that is a further 388 | restriction, you may remove that term. If a license document contains 389 | a further restriction but permits relicensing or conveying under this 390 | License, you may add to a covered work material governed by the terms 391 | of that license document, provided that the further restriction does 392 | not survive such relicensing or conveying. 393 | 394 | If you add terms to a covered work in accord with this section, you 395 | must place, in the relevant source files, a statement of the 396 | additional terms that apply to those files, or a notice indicating 397 | where to find the applicable terms. 398 | 399 | Additional terms, permissive or non-permissive, may be stated in the 400 | form of a separately written license, or stated as exceptions; the 401 | above requirements apply either way. 402 | 403 | #### 8. Termination. 404 | 405 | You may not propagate or modify a covered work except as expressly 406 | provided under this License. Any attempt otherwise to propagate or 407 | modify it is void, and will automatically terminate your rights under 408 | this License (including any patent licenses granted under the third 409 | paragraph of section 11). 
410 | 411 | However, if you cease all violation of this License, then your license 412 | from a particular copyright holder is reinstated (a) provisionally, 413 | unless and until the copyright holder explicitly and finally 414 | terminates your license, and (b) permanently, if the copyright holder 415 | fails to notify you of the violation by some reasonable means prior to 416 | 60 days after the cessation. 417 | 418 | Moreover, your license from a particular copyright holder is 419 | reinstated permanently if the copyright holder notifies you of the 420 | violation by some reasonable means, this is the first time you have 421 | received notice of violation of this License (for any work) from that 422 | copyright holder, and you cure the violation prior to 30 days after 423 | your receipt of the notice. 424 | 425 | Termination of your rights under this section does not terminate the 426 | licenses of parties who have received copies or rights from you under 427 | this License. If your rights have been terminated and not permanently 428 | reinstated, you do not qualify to receive new licenses for the same 429 | material under section 10. 430 | 431 | #### 9. Acceptance Not Required for Having Copies. 432 | 433 | You are not required to accept this License in order to receive or run 434 | a copy of the Program. Ancillary propagation of a covered work 435 | occurring solely as a consequence of using peer-to-peer transmission 436 | to receive a copy likewise does not require acceptance. However, 437 | nothing other than this License grants you permission to propagate or 438 | modify any covered work. These actions infringe copyright if you do 439 | not accept this License. Therefore, by modifying or propagating a 440 | covered work, you indicate your acceptance of this License to do so. 441 | 442 | #### 10. Automatic Licensing of Downstream Recipients. 443 | 444 | Each time you convey a covered work, the recipient automatically 445 | receives a license from the original licensors, to run, modify and 446 | propagate that work, subject to this License. You are not responsible 447 | for enforcing compliance by third parties with this License. 448 | 449 | An "entity transaction" is a transaction transferring control of an 450 | organization, or substantially all assets of one, or subdividing an 451 | organization, or merging organizations. If propagation of a covered 452 | work results from an entity transaction, each party to that 453 | transaction who receives a copy of the work also receives whatever 454 | licenses to the work the party's predecessor in interest had or could 455 | give under the previous paragraph, plus a right to possession of the 456 | Corresponding Source of the work from the predecessor in interest, if 457 | the predecessor has it or can get it with reasonable efforts. 458 | 459 | You may not impose any further restrictions on the exercise of the 460 | rights granted or affirmed under this License. For example, you may 461 | not impose a license fee, royalty, or other charge for exercise of 462 | rights granted under this License, and you may not initiate litigation 463 | (including a cross-claim or counterclaim in a lawsuit) alleging that 464 | any patent claim is infringed by making, using, selling, offering for 465 | sale, or importing the Program or any portion of it. 466 | 467 | #### 11. Patents. 468 | 469 | A "contributor" is a copyright holder who authorizes use under this 470 | License of the Program or a work on which the Program is based. 
The 471 | work thus licensed is called the contributor's "contributor version". 472 | 473 | A contributor's "essential patent claims" are all patent claims owned 474 | or controlled by the contributor, whether already acquired or 475 | hereafter acquired, that would be infringed by some manner, permitted 476 | by this License, of making, using, or selling its contributor version, 477 | but do not include claims that would be infringed only as a 478 | consequence of further modification of the contributor version. For 479 | purposes of this definition, "control" includes the right to grant 480 | patent sublicenses in a manner consistent with the requirements of 481 | this License. 482 | 483 | Each contributor grants you a non-exclusive, worldwide, royalty-free 484 | patent license under the contributor's essential patent claims, to 485 | make, use, sell, offer for sale, import and otherwise run, modify and 486 | propagate the contents of its contributor version. 487 | 488 | In the following three paragraphs, a "patent license" is any express 489 | agreement or commitment, however denominated, not to enforce a patent 490 | (such as an express permission to practice a patent or covenant not to 491 | sue for patent infringement). To "grant" such a patent license to a 492 | party means to make such an agreement or commitment not to enforce a 493 | patent against the party. 494 | 495 | If you convey a covered work, knowingly relying on a patent license, 496 | and the Corresponding Source of the work is not available for anyone 497 | to copy, free of charge and under the terms of this License, through a 498 | publicly available network server or other readily accessible means, 499 | then you must either (1) cause the Corresponding Source to be so 500 | available, or (2) arrange to deprive yourself of the benefit of the 501 | patent license for this particular work, or (3) arrange, in a manner 502 | consistent with the requirements of this License, to extend the patent 503 | license to downstream recipients. "Knowingly relying" means you have 504 | actual knowledge that, but for the patent license, your conveying the 505 | covered work in a country, or your recipient's use of the covered work 506 | in a country, would infringe one or more identifiable patents in that 507 | country that you have reason to believe are valid. 508 | 509 | If, pursuant to or in connection with a single transaction or 510 | arrangement, you convey, or propagate by procuring conveyance of, a 511 | covered work, and grant a patent license to some of the parties 512 | receiving the covered work authorizing them to use, propagate, modify 513 | or convey a specific copy of the covered work, then the patent license 514 | you grant is automatically extended to all recipients of the covered 515 | work and works based on it. 516 | 517 | A patent license is "discriminatory" if it does not include within the 518 | scope of its coverage, prohibits the exercise of, or is conditioned on 519 | the non-exercise of one or more of the rights that are specifically 520 | granted under this License. 
You may not convey a covered work if you 521 | are a party to an arrangement with a third party that is in the 522 | business of distributing software, under which you make payment to the 523 | third party based on the extent of your activity of conveying the 524 | work, and under which the third party grants, to any of the parties 525 | who would receive the covered work from you, a discriminatory patent 526 | license (a) in connection with copies of the covered work conveyed by 527 | you (or copies made from those copies), or (b) primarily for and in 528 | connection with specific products or compilations that contain the 529 | covered work, unless you entered into that arrangement, or that patent 530 | license was granted, prior to 28 March 2007. 531 | 532 | Nothing in this License shall be construed as excluding or limiting 533 | any implied license or other defenses to infringement that may 534 | otherwise be available to you under applicable patent law. 535 | 536 | #### 12. No Surrender of Others' Freedom. 537 | 538 | If conditions are imposed on you (whether by court order, agreement or 539 | otherwise) that contradict the conditions of this License, they do not 540 | excuse you from the conditions of this License. If you cannot convey a 541 | covered work so as to satisfy simultaneously your obligations under 542 | this License and any other pertinent obligations, then as a 543 | consequence you may not convey it at all. For example, if you agree to 544 | terms that obligate you to collect a royalty for further conveying 545 | from those to whom you convey the Program, the only way you could 546 | satisfy both those terms and this License would be to refrain entirely 547 | from conveying the Program. 548 | 549 | #### 13. Use with the GNU Affero General Public License. 550 | 551 | Notwithstanding any other provision of this License, you have 552 | permission to link or combine any covered work with a work licensed 553 | under version 3 of the GNU Affero General Public License into a single 554 | combined work, and to convey the resulting work. The terms of this 555 | License will continue to apply to the part which is the covered work, 556 | but the special requirements of the GNU Affero General Public License, 557 | section 13, concerning interaction through a network will apply to the 558 | combination as such. 559 | 560 | #### 14. Revised Versions of this License. 561 | 562 | The Free Software Foundation may publish revised and/or new versions 563 | of the GNU General Public License from time to time. Such new versions 564 | will be similar in spirit to the present version, but may differ in 565 | detail to address new problems or concerns. 566 | 567 | Each version is given a distinguishing version number. If the Program 568 | specifies that a certain numbered version of the GNU General Public 569 | License "or any later version" applies to it, you have the option of 570 | following the terms and conditions either of that numbered version or 571 | of any later version published by the Free Software Foundation. If the 572 | Program does not specify a version number of the GNU General Public 573 | License, you may choose any version ever published by the Free 574 | Software Foundation. 575 | 576 | If the Program specifies that a proxy can decide which future versions 577 | of the GNU General Public License can be used, that proxy's public 578 | statement of acceptance of a version permanently authorizes you to 579 | choose that version for the Program. 
580 | 581 | Later license versions may give you additional or different 582 | permissions. However, no additional obligations are imposed on any 583 | author or copyright holder as a result of your choosing to follow a 584 | later version. 585 | 586 | #### 15. Disclaimer of Warranty. 587 | 588 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 589 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 590 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT 591 | WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT 592 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 593 | A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND 594 | PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE 595 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR 596 | CORRECTION. 597 | 598 | #### 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR 602 | CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 603 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 604 | ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT 605 | NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR 606 | LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM 607 | TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER 608 | PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 609 | 610 | #### 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | ### How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these 626 | terms. 627 | 628 | To do so, attach the following notices to the program. It is safest to 629 | attach them to the start of each source file to most effectively state 630 | the exclusion of warranty; and each file should have at least the 631 | "copyright" line and a pointer to where the full notice is found. 632 | 633 | 634 | Copyright (C) 635 | 636 | This program is free software: you can redistribute it and/or modify 637 | it under the terms of the GNU General Public License as published by 638 | the Free Software Foundation, either version 3 of the License, or 639 | (at your option) any later version. 640 | 641 | This program is distributed in the hope that it will be useful, 642 | but WITHOUT ANY WARRANTY; without even the implied warranty of 643 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 644 | GNU General Public License for more details. 645 | 646 | You should have received a copy of the GNU General Public License 647 | along with this program. If not, see . 648 | 649 | Also add information on how to contact you by electronic and paper 650 | mail. 
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands \`show w' and \`show c' should show the
661 | appropriate parts of the General Public License. Of course, your
662 | program's commands might be different; for a GUI interface, you would
663 | use an "about box".
664 | 
665 | You should also get your employer (if you work as a programmer) or
666 | school, if any, to sign a "copyright disclaimer" for the program, if
667 | necessary. For more information on this, and how to apply and follow
668 | the GNU GPL, see <https://www.gnu.org/licenses/>.
669 | 
670 | The GNU General Public License does not permit incorporating your
671 | program into proprietary programs. If your program is a subroutine
672 | library, you may consider it more useful to permit linking proprietary
673 | applications with the library. If this is what you want to do, use the
674 | GNU Lesser General Public License instead of this License. But first,
675 | please read <https://www.gnu.org/philosophy/why-not-lgpl.html>.
--------------------------------------------------------------------------------
/MAINTAINERS:
--------------------------------------------------------------------------------
1 | Vadim Markovtsev (@vmarkovtsev)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Apollo
2 | ======
3 | 
4 | Advanced code deduplicator. Powered by [source\{d\} ML](https://github.com/src-d/ml),
5 | [source\{d\} engine](https://github.com/src-d/engine) and [minhashcuda](https://github.com/src-d/minhashcuda).
6 | Agnostic to the analysed language thanks to [Babelfish](https://doc.bblf.sh). Python 3, PySpark, CUDA inside.
7 | 
8 | ### What is this?
9 | 
10 | source{d}'s effort to research and solve the code deduplication problem. At scale, as usual.
11 | A [code clone](https://en.wikipedia.org/wiki/Duplicate_code) is a set of code snippets with few differences.
12 | For now this project focuses on finding near-duplicate projects and files; support for
13 | functions and snippets will come later.
14 | 
15 | ### Should I use it?
16 | 
17 | If you've got hundreds of thousands of files or more, consider it. Otherwise, use one of the many
18 | existing tools which may already be integrated into your IDE.
19 | 
20 | ### Difference from [src-d/gemini](https://github.com/src-d/gemini)?
21 | 
22 | This guy is my brother. Apollo focuses on research, extensibility, flexibility and rapid
23 | changes, while Gemini focuses on performance and serious production usage. All the proven and
24 | tested features will eventually be ported to Gemini. At the same time, Gemini may reuse some
25 | of Apollo's code.
26 | 
27 | ### Algorithm
28 | 
29 | Apollo takes the "hash'em all" approach. We extract unordered weighted features from code aka "weighted bags",
30 | apply [Weighted MinHash](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf)
31 | and then build the [Locality Sensitive Hashing index](http://infolab.stanford.edu/~ullman/mmds/ch3.pdf).
32 | All items which appear in the same hashtable bucket are considered the same. The size of the hash
33 | and the number of hashtables depend on the [weighted Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index#Generalized_Jaccard_similarity_and_distance)
34 | threshold (hence Weighted MinHash). A toy Python sketch of this hashing and bucketing step is included below, after the License section.
35 | 
36 | The features include identifiers such as variable, function or class names, literal values and *structural elements*.
37 | The latter carry the topological information, and we currently support several variants: "node2vec",
38 | "deterministic node2vec" and "role-children atoms". Graphlets are upcoming. Different features
39 | have different weights which will be tuned by a hyperparameter optimization algorithm or even an SGD
40 | (not yet implemented).
41 | 
42 | That's not all, unfortunately: dumping the huge graph of pairwise similarities is of little practical use.
43 | We need to group (cluster) the neighborhoods of densely connected nodes. Apollo solves this problem
44 | in two steps:
45 | 
46 | 1. Run [connected components](https://en.wikipedia.org/wiki/Connected_component_(graph_theory))
47 | analysis to find disjoint parts in the similarity graph.
48 | 2. Run [community detection](https://en.wikipedia.org/wiki/Community_structure) to cluster the components.
49 | The resulting clusters may overlap.
50 | 
51 | ### Implementation
52 | 
53 | Apollo is structured as a series of CLI commands. It stores data in [Cassandra](http://cassandra.apache.org/)
54 | (compatible with [Scylla](http://www.scylladb.com/)) and
55 | writes MinHashCuda batches on disk. Community detection is delegated to [igraph](http://igraph.org/python/).
56 | 
57 | * `resetdb` destructively (re)initializes a Cassandra keyspace.
58 | * `bags` extracts the features, stores them in the database and writes MinHashCuda batches on disk.
59 | Runs source{d} engine through PySpark.
60 | * `hash` performs the hashing, writes the hashtables to the database and hashing parameters on disk
61 | in [Modelforge](https://github.com/src-d/modelforge) format.
62 | * `cc` fetches the buckets, runs the connected component analysis and writes the result on disk in Modelforge
63 | format. Uses PySpark.
64 | * `cmd` reads the connected components and performs the community detection (by default, walktrap).
65 | Uses PySpark.
66 | * `query` outputs items similar to the specified one. In the case of files, either the path or the sha1 is accepted.
67 | * `dumpcmd` outputs the groups of similar items.
68 | 
69 | ### Installation
70 | 
71 | ``` 
72 | mount -o bind /path/to/sourced-ml bundle/ml
73 | mount -o bind /path/to/spark-2.2.0-bin-hadoop2.7 bundle/spark
74 | mount -o bind /path/to/sourced-engine bundle/engine
75 | docker build -t srcd/apollo .
76 | docker run --name scylla -p 9042:9042 -v /var/lib/scylla:/var/lib/scylla -d scylladb/scylla --developer-mode=1
77 | docker run -it --rm --link scylla srcd/apollo resetdb --cassandra scylla
78 | docker run -d --name bblfshd --privileged -p 9432:9432 -v /var/lib/bblfshd:/var/lib/bblfshd bblfsh/bblfshd
79 | docker exec -it bblfshd bblfshctl driver install --all
80 | ``` 
81 | 
82 | You are going to need [grip](https://github.com/joeyespo/grip) to instantly render Markdown reports
83 | in your browser. There are multiple Docker options available, e.g.
84 | [1](https://github.com/psycofdj/docker-grip), [2](https://github.com/fstab/docker-grip),
85 | [3](https://github.com/kba/grip-docker).
86 | 
87 | ### Contributions
88 | 
89 | ...are welcome! See [CONTRIBUTING](CONTRIBUTING.md) and [code of conduct](CODE_OF_CONDUCT.md).
90 | 
91 | ### License
92 | 
93 | [GPL](LICENSE.md).
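### Toy hashing sketch

The snippet below is a minimal, self-contained illustration of the hashing and bucketing step described
in the Algorithm section, written with [datasketch](https://github.com/ekzhu/datasketch), which the Docker
image already installs. The vocabulary, the weights and the "16 hashtables of 8 rows" layout are invented
for the example and are not Apollo's defaults: the real pipeline hashes the weighted bags with MinHashCuda,
derives the hashtable layout from the requested Jaccard similarity threshold and stores the buckets in Cassandra.

```
# Toy pipeline: weighted bags -> Weighted MinHash -> LSH buckets.
from collections import defaultdict

import numpy as np
from datasketch import WeightedMinHashGenerator

vocabulary = ["id.foo", "id.bar", "lit.42", "uast2seq.a>b>c"]  # invented feature names
bags = {  # file -> feature weights over the shared vocabulary
    "file1.py": np.array([3.0, 1.0, 0.1, 2.0]),
    "file2.py": np.array([3.0, 1.0, 0.5, 2.0]),  # near-duplicate of file1.py
    "file3.py": np.array([0.1, 0.1, 5.0, 0.1]),
}

gen = WeightedMinHashGenerator(len(vocabulary), sample_size=128, seed=7)
hashes = {name: gen.minhash(weights) for name, weights in bags.items()}

# LSH banding: split the 128 samples into 16 hashtables of 8 rows each.
# Items sharing a bucket in any hashtable become candidate duplicates.
bands, rows = 16, 8
buckets = defaultdict(set)
for name, wmh in hashes.items():
    for band in range(bands):
        band_rows = wmh.hashvalues[band * rows:(band + 1) * rows]
        buckets[(band, tuple(map(tuple, band_rows)))].add(name)

candidates = {frozenset(b) for b in buckets.values() if len(b) > 1}
print(candidates)  # almost surely {frozenset({'file1.py', 'file2.py'})}
print(hashes["file1.py"].jaccard(hashes["file2.py"]))  # estimated weighted Jaccard, ~0.94
```

Pairs that never share a bucket are never compared, which is what keeps the approach scalable; the `cc`
and `cmd` commands then turn the candidate buckets into connected components and overlapping communities.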
94 | 95 | ## Docker command snippets 96 | 97 | ### Bags 98 | 99 | ``` 100 | docker run -it --rm -v /path/to/io:/io --link bblfshd --link scylla srcd/apollo bags -r /io/siva \ 101 | --bow /io/bags/bow.asdf --docfreq /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \ 102 | -l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --cassandra scylla --persist MEMORY_ONLY \ 103 | --config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G 104 | ``` 105 | 106 | ### Hash 107 | 108 | ``` 109 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo hash /io/batches/bow*.asdf -p /io/bags/params.asdf \ 110 | -t 0.8 --cassandra scylla 111 | ``` 112 | 113 | ### Query sha1 114 | 115 | ``` 116 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo query -i --precise \ 117 | --docfreq /io/bags/docfreq.asdf -t 0.8 --cassandra scylla 118 | ``` 119 | 120 | ### Query file 121 | 122 | ``` 123 | docker run -it --rm -v /path/to/io:/io -v .:/q --link bblfshd --link scylla srcd/apollo query \ 124 | -f /q/myfile.java --bblfsh bblfshd --cassandra scylla --precise --docfreq /io/docfreq.asdf \ 125 | --params /io/params.asdf -t 0.9 | grip -b - 126 | ``` 127 | 128 | ### Connected components 129 | 130 | ``` 131 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo cc -o /io/ccs.asdf 132 | ``` 133 | 134 | ### Dump connected components 135 | 136 | ``` 137 | docker run -it --rm -v /path/to/io:/io srcd/apollo dumpcc -o /io/ccs.asdf 138 | ``` 139 | 140 | ### Community detection 141 | 142 | ``` 143 | docker run -it --rm -v /path/to/io:/io srcd/apollo cmd -i /io/ccs.asdf -o /io/communities.asdf -s 'local[*]' 144 | ``` 145 | 146 | ### Dump communities (final report) 147 | 148 | ``` 149 | docker run -it --rm -v /path/to/io:/io srcd/apollo dumpcmd /io/communities.asdf | grip -b - 150 | ``` 151 | -------------------------------------------------------------------------------- /apollo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/apollo/fcdf67bb579681bbf978168e909cd74207ed06db/apollo/__init__.py -------------------------------------------------------------------------------- /apollo/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from time import time 6 | 7 | from igraph import Graph 8 | from modelforge.logs import setup_logging 9 | from sourced.ml import extractors 10 | from sourced.ml.utils import add_engine_args, add_spark_args 11 | from sourced.ml.cmd import ArgumentDefaultsHelpFormatterNoNone 12 | from sourced.ml.cmd.args import add_bow_args, add_feature_args, add_repo2_args, \ 13 | add_df_args, add_repartitioner_arg 14 | 15 | from apollo.bags import preprocess, source2bags 16 | from apollo.cassandra_utils import reset_db 17 | from apollo.graph import find_connected_components, dumpcc, detect_communities, dumpcmd, \ 18 | evaluate_communities 19 | from apollo.hasher import hash_batches 20 | from apollo.query import query 21 | from apollo.warmup import warmup 22 | 23 | 24 | CASSANDRA_PACKAGE = "com.datastax.spark:spark-cassandra-connector_2.11:2.0.3" 25 | 26 | 27 | def get_parser() -> argparse.ArgumentParser: 28 | """ 29 | Create the cmdline argument parser. 
30 | """ 31 | parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone) 32 | parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel, 33 | help="Logging verbosity.") 34 | 35 | def add_feature_weight_arg(my_parser): 36 | help_desc = "%s's weight - all features from this extractor will be multiplied by this " \ 37 | "factor" 38 | for ex in extractors.__extractors__.values(): 39 | my_parser.add_argument("--%s-weight" % ex.NAME, default=1, type=float, 40 | help=help_desc % ex.__name__) 41 | 42 | def add_cassandra_args(my_parser): 43 | my_parser.add_argument( 44 | "--cassandra", default="0.0.0.0:9042", help="Cassandra's host:port.") 45 | my_parser.add_argument("--keyspace", default="apollo", 46 | help="Cassandra's key space.") 47 | my_parser.add_argument( 48 | "--tables", help="Table name mapping (JSON): bags, hashes, hashtables, hashtables2.") 49 | 50 | def add_wmh_args(my_parser, params_help: str, add_hash_size: bool, required: bool): 51 | if add_hash_size: 52 | my_parser.add_argument("--size", type=int, default=128, help="Hash size.") 53 | my_parser.add_argument("-p", "--params", required=required, help=params_help) 54 | my_parser.add_argument("-t", "--threshold", required=required, type=float, 55 | help="Jaccard similarity threshold.") 56 | my_parser.add_argument("--false-positive-weight", type=float, default=0.5, 57 | help="Used to adjust the relative importance of " 58 | "minimizing false positives count when optimizing " 59 | "for the Jaccard similarity threshold.") 60 | my_parser.add_argument("--false-negative-weight", type=float, default=0.5, 61 | help="Used to adjust the relative importance of " 62 | "minimizing false negatives count when optimizing " 63 | "for the Jaccard similarity threshold.") 64 | 65 | def add_template_args(my_parser, default_template): 66 | my_parser.add_argument("--batch", type=int, default=100, 67 | help="Number of hashes to query at a time.") 68 | my_parser.add_argument("--template", default=default_template, 69 | help="Jinja2 template to render.") 70 | 71 | # Create and construct subparsers 72 | subparsers = parser.add_subparsers(help="Commands", dest="command") 73 | 74 | # ------------------------------------------------------------------------ 75 | warmup_parser = subparsers.add_parser( 76 | "warmup", help="Initialize source{d} engine.") 77 | warmup_parser.set_defaults(handler=warmup) 78 | add_engine_args(warmup_parser, default_packages=[CASSANDRA_PACKAGE]) 79 | 80 | # ------------------------------------------------------------------------ 81 | db_parser = subparsers.add_parser("resetdb", help="Destructively initialize the database.") 82 | db_parser.set_defaults(handler=reset_db) 83 | add_cassandra_args(db_parser) 84 | db_parser.add_argument( 85 | "--hashes-only", action="store_true", 86 | help="Only clear the tables: hashes, hashtables, hashtables2. 
Do not touch the rest.") 87 | # ------------------------------------------------------------------------ 88 | preprocess_parser = subparsers.add_parser( 89 | "preprocess", help="Creates the index, quant and docfreq model of the bag-of-words model.") 90 | preprocess_parser.set_defaults(handler=preprocess) 91 | add_df_args(preprocess_parser) 92 | add_repo2_args(preprocess_parser) 93 | add_feature_args(preprocess_parser) 94 | add_repartitioner_arg(preprocess_parser) 95 | preprocess_parser.add_argument( 96 | "--cached-index-path", default=None, 97 | help="[OUT] Path to the docfreq model holding the document's index.") 98 | # ------------------------------------------------------------------------ 99 | source2bags_parser = subparsers.add_parser( 100 | "bags", help="Convert source code to weighted sets.") 101 | source2bags_parser.set_defaults(handler=source2bags) 102 | add_bow_args(source2bags_parser) 103 | add_repo2_args(source2bags_parser, default_packages=[CASSANDRA_PACKAGE]) 104 | add_feature_args(source2bags_parser) 105 | add_cassandra_args(source2bags_parser) 106 | add_df_args(source2bags_parser) 107 | add_repartitioner_arg(source2bags_parser) 108 | source2bags_parser.add_argument( 109 | "--cached-index-path", default=None, 110 | help="[IN] Path to the docfreq model holding the document's index.") 111 | 112 | # ------------------------------------------------------------------------ 113 | hash_parser = subparsers.add_parser( 114 | "hash", help="Run MinHashCUDA on the bag batches.") 115 | hash_parser.set_defaults(handler=hash_batches) 116 | hash_parser.add_argument("-i", "--input", 117 | help="Path to the directory with Parquet files.") 118 | hash_parser.add_argument("--seed", type=int, default=int(time()), 119 | help="Random generator's seed.") 120 | hash_parser.add_argument("--mhc-verbosity", type=int, default=1, 121 | help="MinHashCUDA logs verbosity level.") 122 | hash_parser.add_argument("--devices", type=int, default=0, 123 | help="Or-red indices of NVIDIA devices to use. 
0 means all.") 124 | add_wmh_args(hash_parser, "Path to the output file with WMH parameters.", True, True) 125 | add_cassandra_args(hash_parser) 126 | add_spark_args(hash_parser, default_packages=[CASSANDRA_PACKAGE]) 127 | add_feature_weight_arg(hash_parser) 128 | add_repartitioner_arg(hash_parser) 129 | 130 | # ------------------------------------------------------------------------ 131 | query_parser = subparsers.add_parser("query", help="Query for similar files.") 132 | query_parser.set_defaults(handler=query) 133 | mode_group = query_parser.add_mutually_exclusive_group(required=True) 134 | mode_group.add_argument("-i", "--id", help="Query for this id (id mode).") 135 | mode_group.add_argument("-c", "--file", help="Query for this file (file mode).") 136 | query_parser.add_argument("--docfreq", help="Path to OrderedDocumentFrequencies (file mode).") 137 | query_parser.add_argument("--min-docfreq", default=1, type=int, 138 | help="The minimum document frequency of each feature.") 139 | query_parser.add_argument( 140 | "--bblfsh", default="localhost:9432", help="Babelfish server's address.") 141 | query_parser.add_argument("--precise", action="store_true", 142 | help="Calculate the precise set.") 143 | add_wmh_args(query_parser, "Path to the Weighted MinHash parameters.", False, False) 144 | add_feature_args(query_parser, required=False) 145 | add_template_args(query_parser, "query.md.jinja2") 146 | add_cassandra_args(query_parser) 147 | 148 | # ------------------------------------------------------------------------ 149 | cc_parser = subparsers.add_parser( 150 | "cc", help="Load the similar pairs of files and run connected components analysis.") 151 | cc_parser.set_defaults(handler=find_connected_components) 152 | add_cassandra_args(cc_parser) 153 | cc_parser.add_argument("-o", "--output", required=True, 154 | help="[OUT] Path to connected components ASDF model.") 155 | 156 | # ------------------------------------------------------------------------ 157 | dumpcc_parser = subparsers.add_parser( 158 | "dumpcc", help="Output the connected components to stdout.") 159 | dumpcc_parser.set_defaults(handler=dumpcc) 160 | dumpcc_parser.add_argument("-i", "--input", required=True, 161 | help="Path to connected components ASDF model.") 162 | # ------------------------------------------------------------------------ 163 | community_parser = subparsers.add_parser( 164 | "cmd", help="Run Community Detection analysis on the connected components from \"cc\".") 165 | community_parser.set_defaults(handler=detect_communities) 166 | community_parser.add_argument("-i", "--input", required=True, 167 | help="Path to connected components ASDF model.") 168 | community_parser.add_argument("-o", "--output", required=True, 169 | help="[OUT] Path to the communities ASDF model.") 170 | community_parser.add_argument("--edges", choices=("linear", "quadratic", "1", "2"), 171 | default="linear", 172 | help="The method to generate the graph's edges: bipartite - " 173 | "linear and fast, but may not fit some the CD algorithms, " 174 | "or all to all within a bucket - quadratic and slow, but " 175 | "surely fits all the algorithms.") 176 | cmd_choices = [k[10:] for k in dir(Graph) if k.startswith("community_")] 177 | community_parser.add_argument("-a", "--algorithm", choices=cmd_choices, 178 | default="walktrap", 179 | help="The community detection algorithm to apply.") 180 | community_parser.add_argument("-p", "--params", type=json.loads, default={}, 181 | help="Parameters for the algorithm (**kwargs, JSON format).") 182 | 
community_parser.add_argument("--no-spark", action="store_true", help="Do not use Spark.") 183 | add_spark_args(community_parser) 184 | 185 | # ------------------------------------------------------------------------ 186 | dumpcmd_parser = subparsers.add_parser( 187 | "dumpcmd", help="Output the detected communities to stdout.") 188 | dumpcmd_parser.set_defaults(handler=dumpcmd) 189 | dumpcmd_parser.add_argument("input", help="Path to the communities ASDF model.") 190 | add_template_args(dumpcmd_parser, "report.md.jinja2") 191 | add_cassandra_args(dumpcmd_parser) 192 | 193 | # ------------------------------------------------------------------------ 194 | evalcc_parser = subparsers.add_parser( 195 | "evalcc", help="Evaluate the communities: calculate the precise similarity and the " 196 | "fitness metric.") 197 | evalcc_parser.set_defaults(handler=evaluate_communities) 198 | evalcc_parser.add_argument("-t", "--threshold", required=True, type=float, 199 | help="Jaccard similarity threshold.") 200 | evalcc_parser.add_argument("-i", "--input", required=True, 201 | help="Path to the communities model.") 202 | 203 | add_spark_args(evalcc_parser, default_packages=[CASSANDRA_PACKAGE]) 204 | add_cassandra_args(evalcc_parser) 205 | 206 | # TODO: retable [.....] -> [.] [.] [.] [.] [.] 207 | return parser 208 | 209 | 210 | def main(): 211 | """ 212 | Creates all the argument parsers and invokes the function from set_defaults(). 213 | 214 | :return: The result of the function from set_defaults(). 215 | """ 216 | parser = get_parser() 217 | args = parser.parse_args() 218 | args.log_level = logging._nameToLevel[args.log_level] 219 | setup_logging(args.log_level) 220 | try: 221 | handler = args.handler 222 | except AttributeError: 223 | def print_usage(_): 224 | parser.print_usage() 225 | 226 | handler = print_usage 227 | return handler(args) 228 | 229 | 230 | if __name__ == "__main__": 231 | sys.exit(main()) 232 | -------------------------------------------------------------------------------- /apollo/bags.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import Row 2 | from sourced.ml.cmd import repos2bow_template, repos2bow_index_template 3 | from sourced.ml.transformers import Transformer 4 | 5 | from apollo import cassandra_utils 6 | 7 | 8 | class BagsSaver(Transformer): 9 | def __init__(self, keyspace, table, **kwargs): 10 | super().__init__(**kwargs) 11 | self.keyspace = keyspace 12 | self.table = table 13 | 14 | def __call__(self, head): 15 | rows = head.map(lambda row: Row(sha1=row.document, 16 | item=row.token, 17 | value=float(row.value))) 18 | if self.explained: 19 | self._log.info("toDebugString():\n%s", rows.toDebugString().decode()) 20 | rows.toDF() \ 21 | .write \ 22 | .format("org.apache.spark.sql.cassandra") \ 23 | .mode("append") \ 24 | .options(table=self.table, keyspace=self.keyspace) \ 25 | .save() 26 | return head 27 | 28 | 29 | class MetadataSaver(Transformer): 30 | def __init__(self, keyspace, table, **kwargs): 31 | super().__init__(**kwargs) 32 | self.keyspace = keyspace 33 | self.table = table 34 | 35 | def __call__(self, head): 36 | rows = head.map(lambda x: Row( 37 | sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path)) 38 | if self.explained: 39 | self._log.info("toDebugString():\n%s", rows.toDebugString().decode()) 40 | rows.toDF() \ 41 | .write \ 42 | .format("org.apache.spark.sql.cassandra") \ 43 | .mode("append") \ 44 | .options(table=self.table, keyspace=self.keyspace) \ 45 | .save() 46 | 
47 | 48 | def preprocess(args): 49 | return repos2bow_index_template(args) 50 | 51 | 52 | def source2bags(args): 53 | cassandra_utils.configure(args) 54 | return repos2bow_template( 55 | args, 56 | cache_hook=lambda: MetadataSaver(args.keyspace, args.tables["meta"]), 57 | save_hook=lambda: BagsSaver(args.keyspace, args.tables["bags"])) 58 | -------------------------------------------------------------------------------- /apollo/cassandra_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging 3 | import json 4 | import platform 5 | import re 6 | from typing import Iterable 7 | 8 | import modelforge.logs 9 | from cassandra.cluster import Cluster, Session, NoHostAvailable 10 | from cassandra.policies import RoundRobinPolicy 11 | 12 | 13 | def patch_tables(args): 14 | if args.tables and isinstance(args.tables, str): 15 | tables = args.tables 16 | else: 17 | tables = "" 18 | defaults = ("bags", "meta", "hashes", "hashtables", "hashtables2") 19 | args.tables = {n: n for n in defaults} 20 | if tables: 21 | args.tables.update(json.loads(tables)) 22 | 23 | 24 | def configure(args): 25 | try: 26 | cas_host, cas_port = args.cassandra.split(":") 27 | except ValueError: 28 | cas_host = args.cassandra 29 | cas_port = "9042" 30 | args.config.append("spark.cassandra.connection.host=" + cas_host) 31 | args.config.append("spark.cassandra.connection.port=" + cas_port) 32 | patch_tables(args) 33 | return args 34 | 35 | 36 | def get_db(args): 37 | log = logging.getLogger("cassandra") 38 | patch_tables(args) 39 | try: 40 | cas_host, cas_port = args.cassandra.split(":") 41 | except ValueError: 42 | cas_host = args.cassandra 43 | cas_port = "9042" 44 | 45 | def get_cluster(): 46 | return Cluster((cas_host,), port=int(cas_port), 47 | load_balancing_policy=RoundRobinPolicy()) 48 | cluster = get_cluster() 49 | log.info("Connecting to %s", args.cassandra) 50 | try: 51 | session = cluster.connect(args.keyspace) 52 | except NoHostAvailable: 53 | log.warning("Keyspace %s does not exist", args.keyspace) 54 | cluster = get_cluster() 55 | session = cluster.connect() 56 | return session 57 | 58 | 59 | def reset_db(args): 60 | db = get_db(args) 61 | 62 | def cql(cmd): 63 | print(cmd + ";") 64 | db.execute(cmd) 65 | 66 | if not args.hashes_only: 67 | cql("DROP KEYSPACE IF EXISTS %s" % args.keyspace) 68 | cql("CREATE KEYSPACE %s WITH REPLICATION = {" 69 | "'class' : 'SimpleStrategy', 'replication_factor' : 1}" % args.keyspace) 70 | print("USE %s;" % args.keyspace) 71 | db.set_keyspace(args.keyspace) 72 | tables = args.tables 73 | if not args.hashes_only: 74 | cql("CREATE TABLE %s (sha1 ascii, item ascii, value float, PRIMARY KEY (sha1, item))" 75 | % tables["bags"]) 76 | cql("CREATE TABLE %s (sha1 varchar, repo varchar, commit ascii, path varchar, " 77 | "PRIMARY KEY (sha1, repo, commit, path))" % tables["meta"]) 78 | else: 79 | cql("DROP TABLE IF EXISTS %s" % tables["hashes"]) 80 | cql("DROP TABLE IF EXISTS %s" % tables["hashtables"]) 81 | cql("DROP TABLE IF EXISTS %s" % tables["hashtables2"]) 82 | cql("CREATE TABLE %s (sha1 varchar, value blob, PRIMARY KEY (sha1))" % tables["hashes"]) 83 | cql("CREATE TABLE %s (sha1 varchar, hashtable tinyint, value blob, " 84 | "PRIMARY KEY (hashtable, value, sha1))" % tables["hashtables"]) 85 | cql("CREATE TABLE %s (sha1 varchar, hashtable tinyint, value blob, " 86 | "PRIMARY KEY (sha1, hashtable))" % tables["hashtables2"]) 87 | 88 | 89 | class BatchedHashResolver: 90 | def __init__(self, hashes: 
Iterable, batch_size: int, session: Session, table: str): 91 | self.hashes = iter(hashes) 92 | self.batch_size = batch_size 93 | self.session = session 94 | self.table = table 95 | self.buffer = [] 96 | self._log = logging.getLogger("BatchedHashResolver") 97 | 98 | def __next__(self): 99 | while True: 100 | if not self.buffer: 101 | self._pump() 102 | r = None 103 | while r is None and self.buffer: 104 | r = self.buffer.pop() 105 | if r is not None: 106 | return r 107 | 108 | def __iter__(self): 109 | return self 110 | 111 | def _pump(self): 112 | first_hash = next(self.hashes) 113 | try: 114 | fh, fm = first_hash 115 | items = {h: (i, m) for i, (h, m) in zip(range(1, self.batch_size), self.hashes)} 116 | items[fh] = 0, fm 117 | meta = True 118 | except ValueError: 119 | items = {h: i for i, h in zip(range(1, self.batch_size), self.hashes)} 120 | items[first_hash] = 0 121 | meta = False 122 | if not items: 123 | raise StopIteration() 124 | query = "select sha1, repo, commit, path from %s where sha1 in (%s)" % ( 125 | self.table, ",".join("'%s'" % h for h in items)) 126 | self._log.debug("%s in (%d)", query[:query.find(" in (")], len(items)) 127 | rows = self.session.execute(query) 128 | buffer = self.buffer 129 | buffer.extend(None for _ in items) 130 | l = len(items) # noqa 131 | count = 0 132 | for r in rows: 133 | count += 1 134 | if meta: 135 | i, m = items[r.sha1] 136 | else: 137 | i = items[r.sha1] 138 | m = None 139 | # reverse order - we will pop() in __next__ 140 | tr = r.sha1, (r.repo, r.commit, r.path) 141 | buffer[l - i - 1] = (tr + (m,)) if meta else tr 142 | self._log.debug("-> %d", count) 143 | 144 | 145 | class ColorFormatter(logging.Formatter): 146 | """ 147 | logging Formatter which prints messages with colors. 148 | """ 149 | GREEN_MARKERS = [" ok", "ok:", "finished", "completed", "ready", 150 | "done", "running", "success", "saved"] 151 | GREEN_RE = re.compile("|".join(GREEN_MARKERS)) 152 | BEER_MUG = platform.uname().release.endswith("-moby") 153 | FUR_TREE = datetime.now().month == 12 and datetime.now().day >= 8 154 | 155 | def formatMessage(self, record): 156 | level_color = "0" 157 | text_color = "0" 158 | fmt = "" 159 | if record.levelno <= logging.DEBUG: 160 | fmt = "\033[0;37m" + logging.BASIC_FORMAT + "\033[0m" 161 | elif record.levelno <= logging.INFO: 162 | level_color = "1;36" 163 | lmsg = record.message.lower() 164 | if self.GREEN_RE.search(lmsg): 165 | text_color = "1;32" 166 | elif record.levelno <= logging.WARNING: 167 | level_color = "1;33" 168 | elif record.levelno <= logging.CRITICAL: 169 | level_color = "1;31" 170 | if self.BEER_MUG: 171 | spice = "🍺 " 172 | elif self.FUR_TREE: 173 | spice = "🎄 " 174 | else: 175 | spice = "" 176 | if not fmt: 177 | fmt = "\033[" + level_color + \ 178 | "m" + spice + "%(levelname)s\033[0m:%(name)s:\033[" + text_color + \ 179 | "m%(message)s\033[0m" 180 | return fmt % record.__dict__ 181 | 182 | 183 | modelforge.logs.ColorFormatter = ColorFormatter 184 | -------------------------------------------------------------------------------- /apollo/graph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import chain 3 | import logging 4 | import os 5 | import sys 6 | from uuid import uuid4 7 | 8 | from igraph import Graph 9 | from modelforge import Model, merge_strings, split_strings, assemble_sparse_matrix, \ 10 | disassemble_sparse_matrix, register_model 11 | from modelforge.progress_bar import progress_bar 12 | import numpy 13 | 
from pyspark.sql.types import Row 14 | from scipy.sparse import csr_matrix 15 | from sourced.ml.utils import create_spark 16 | from sourced.ml.extractors.helpers import filter_kwargs 17 | 18 | from apollo.cassandra_utils import get_db, configure, BatchedHashResolver, Session 19 | from apollo.query import weighted_jaccard, stream_template 20 | 21 | 22 | @register_model 23 | class ConnectedComponentsModel(Model): 24 | """ 25 | Model to store the connected components. 26 | """ 27 | NAME = "connected_components" 28 | 29 | def construct(self, connected_components, element_to_buckets, element_to_id): 30 | self.id_to_cc = numpy.zeros(len(element_to_id), dtype=numpy.uint32) 31 | for cc, ids in connected_components.items(): 32 | for id_ in ids: 33 | self.id_to_cc[id_] = cc 34 | self.id_to_element = [None] * len(element_to_id) 35 | for k, v in element_to_id.items(): 36 | self.id_to_element[v] = k 37 | data = numpy.ones(sum(map(len, element_to_buckets)), dtype=numpy.uint8) 38 | indices = numpy.zeros(len(data), dtype=numpy.uint32) 39 | indptr = numpy.zeros(len(element_to_buckets) + 1, dtype=numpy.uint32) 40 | pos = 0 41 | for i, element in enumerate(element_to_buckets): 42 | indices[pos:(pos + len(element))] = element 43 | pos += len(element) 44 | indptr[i + 1] = pos 45 | self.id_to_buckets = csr_matrix((data, indices, indptr)) 46 | return self 47 | 48 | def _load_tree(self, tree): 49 | self.id_to_cc = tree["cc"] 50 | self.id_to_cc[0] # do not remove - loads the array from disk 51 | self.id_to_element = split_strings(tree["elements"]) 52 | self.id_to_buckets = assemble_sparse_matrix(tree["buckets"]) 53 | 54 | def dump(self): 55 | return "Number of connected components: %s\nNumber of unique elements: %s" % ( 56 | len(numpy.unique(self.id_to_cc)), len(self.id_to_element)) 57 | 58 | def _generate_tree(self): 59 | return {"cc": self.id_to_cc, "elements": merge_strings(self.id_to_element), 60 | "buckets": disassemble_sparse_matrix(self.id_to_buckets)} 61 | 62 | 63 | def _find_connected_component(buckets, element_to_buckets): 64 | """ 65 | Find connected components among buckets. 
66 | :param buckets: list of buckets where each bucket contains list of elements 67 | :param element_to_buckets: mapping from element to list of buckets where it appears 68 | :return: mapping from connected component to set of elements in it 69 | """ 70 | unvisited_buckets = set(range(len(buckets))) 71 | connected_components_element = defaultdict(set) 72 | 73 | cc_id = 0 # connected component counter 74 | while unvisited_buckets: 75 | pending = {unvisited_buckets.pop()} 76 | while pending: 77 | bucket = pending.pop() 78 | elements = buckets[bucket] 79 | connected_components_element[cc_id].update(elements) 80 | for element in elements: 81 | element_buckets = element_to_buckets[element] 82 | for b in element_buckets: 83 | if b in unvisited_buckets: 84 | pending.add(b) 85 | unvisited_buckets.remove(b) 86 | # increase number of connected components 87 | cc_id += 1 88 | return connected_components_element 89 | 90 | 91 | def find_connected_components(args): 92 | log = logging.getLogger("graph") 93 | session = get_db(args) 94 | table = args.tables["hashtables"] 95 | rows = session.execute("SELECT DISTINCT hashtable FROM %s" % table) 96 | hashtables = sorted(r.hashtable for r in rows) 97 | log.info("Detected %d hashtables", len(hashtables)) 98 | 99 | # Read buckets from database 100 | buckets = [] 101 | element_ids = {} 102 | prev_len = 0 103 | for hashtable in hashtables: 104 | rows = session.execute( 105 | "SELECT sha1, value FROM %s WHERE hashtable=%d" % (table, hashtable)) 106 | band = None 107 | bucket = [] 108 | for row in rows: 109 | eid = element_ids.setdefault(row.sha1, len(element_ids)) 110 | if row.value != band: 111 | if band is not None: 112 | buckets.append(bucket.copy()) 113 | bucket.clear() 114 | band = row.value 115 | bucket.append(eid) 116 | continue 117 | bucket.append(eid) 118 | if bucket: 119 | buckets.append(bucket) 120 | log.info("Fetched %d, %d buckets", hashtable, len(buckets) - prev_len) 121 | prev_len = len(buckets) 122 | 123 | element_to_buckets = [[] for _ in range(len(element_ids))] 124 | for i, bucket in enumerate(buckets): 125 | for element in bucket: 126 | element_to_buckets[element].append(i) 127 | 128 | # Statistics about buckets 129 | levels = (logging.ERROR, logging.INFO) 130 | log.info("Number of buckets: %d", len(buckets)) 131 | log.log(levels[len(element_ids) >= len(buckets[0])], 132 | "Number of elements: %d", len(element_ids)) 133 | epb = sum(map(len, buckets)) / len(buckets) 134 | log.log(levels[epb >= 1], "Average number of elements per bucket: %.1f", epb) 135 | nb = min(map(len, element_to_buckets)) 136 | log.log(levels[nb == len(hashtables)], "Min number of buckets per element: %s", nb) 137 | nb = max(map(len, element_to_buckets)) 138 | log.log(levels[nb == len(hashtables)], "Max number of buckets per element: %s", nb) 139 | log.info("Running CC analysis") 140 | 141 | # Connect components 142 | connected_components_element = _find_connected_component(buckets, element_to_buckets) 143 | log.info("CC number: %d", len(connected_components_element)) 144 | 145 | log.info("Writing %s", args.output) 146 | ConnectedComponentsModel() \ 147 | .construct(connected_components_element, element_to_buckets, element_ids) \ 148 | .save(args.output) 149 | 150 | 151 | def dumpcc(args): 152 | model = ConnectedComponentsModel().load(args.input) 153 | ccs = defaultdict(list) 154 | for i, cc in enumerate(model.id_to_cc): 155 | ccs[cc].append(i) 156 | for _, cc in sorted(ccs.items()): 157 | print(" ".join(model.id_to_element[i] for i in cc)) 158 | 159 | 160 | @register_model 
161 | class CommunitiesModel(Model): 162 | """ 163 | Model to store the node communities. 164 | """ 165 | NAME = "communities" 166 | 167 | def construct(self, communities, id_to_element): 168 | self.communities = communities 169 | self.id_to_element = id_to_element 170 | return self 171 | 172 | def _load_tree(self, tree): 173 | self.id_to_element = split_strings(tree["elements"]) 174 | data, indptr = tree["data"], tree["indptr"] 175 | self.communities = [data[i:j] for i, j in zip(indptr, indptr[1:])] 176 | 177 | def _generate_tree(self): 178 | size = sum(map(len, self.communities)) 179 | data = numpy.zeros(size, dtype=numpy.uint32) 180 | indptr = numpy.zeros(len(self.communities) + 1, dtype=numpy.int64) 181 | pos = 0 182 | for i, community in enumerate(self.communities): 183 | data[pos:pos + len(community)] = community 184 | pos += len(community) 185 | indptr[i + 1] = pos 186 | return {"data": data, "indptr": indptr, "elements": merge_strings(self.id_to_element)} 187 | 188 | def dump(self): 189 | return "Number of communities: %s" % (len(self.communities)) 190 | 191 | def count_elements(self): 192 | return sum(sum(1 for i in c if i < len(self.id_to_element)) for c in self.communities) 193 | 194 | 195 | def detect_communities(args): 196 | log = logging.getLogger("cmd") 197 | ccsmodel = ConnectedComponentsModel().load(args.input) 198 | log.info("Building the connected components") 199 | ccs = defaultdict(list) 200 | for i, c in enumerate(ccsmodel.id_to_cc): 201 | ccs[c].append(i) 202 | buckmat = ccsmodel.id_to_buckets 203 | buckindices = buckmat.indices 204 | buckindptr = buckmat.indptr 205 | total_nvertices = buckmat.shape[0] 206 | linear = args.edges in ("linear", "1") 207 | graphs = [] 208 | communities = [] 209 | if not linear: 210 | log.info("Transposing the matrix") 211 | buckmat_csc = buckmat.T.tocsr() 212 | fat_ccs = [] 213 | for vertices in ccs.values(): 214 | if len(vertices) == 1: 215 | continue 216 | if len(vertices) == 2: 217 | communities.append(vertices) 218 | continue 219 | fat_ccs.append(vertices) 220 | log.info("Building %d graphs", len(fat_ccs)) 221 | for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)): 222 | if linear: 223 | edges = [] 224 | weights = [] 225 | bucket_weights = buckmat.sum(axis=0) 226 | buckets = set() 227 | for i in vertices: 228 | for j in range(buckindptr[i], buckindptr[i + 1]): 229 | bucket = buckindices[j] 230 | weights.append(bucket_weights[0, bucket]) 231 | bucket += total_nvertices 232 | buckets.add(bucket) 233 | edges.append((str(i), str(bucket))) 234 | else: 235 | edges = set() 236 | weights = None 237 | buckets = set() 238 | for i in vertices: 239 | for j in range(buckindptr[i], buckindptr[i + 1]): 240 | buckets.add(buckindices[j]) 241 | for bucket in buckets: 242 | buckverts = \ 243 | buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]] 244 | for i, x in enumerate(buckverts): 245 | for y in buckverts: 246 | if x < y: 247 | edges.add((str(x), str(y))) 248 | buckets.clear() 249 | edges = list(edges) 250 | graph = Graph(directed=False) 251 | graph.add_vertices(list(map(str, vertices + list(buckets)))) 252 | graph.add_edges(edges) 253 | graph.edge_weights = weights 254 | graphs.append(graph) 255 | log.info("Launching the community detection") 256 | detector = CommunityDetector(algorithm=args.algorithm, config=args.params) 257 | if not args.no_spark: 258 | spark = create_spark( 259 | "cmd-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark)).sparkContext 260 | 
communities.extend(spark.parallelize(graphs).flatMap(detector).collect()) 261 | else: 262 | communities.extend(chain.from_iterable(progress_bar( 263 | (detector(g) for g in graphs), log, expected_size=len(graphs)))) 264 | log.info("Overall communities: %d", len(communities)) 265 | log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities])) 266 | log.info("Median community size: %.1f", numpy.median([len(c) for c in communities])) 267 | log.info("Max community size: %d", max(map(len, communities))) 268 | log.info("Writing %s", args.output) 269 | CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output) 270 | 271 | 272 | class CommunityDetector: 273 | def __init__(self, algorithm, config): 274 | self.algorithm = algorithm 275 | self.config = config 276 | 277 | def __call__(self, graph): 278 | action = getattr(graph, "community_" + self.algorithm) 279 | if self.algorithm == "infomap": 280 | kwargs = {"edge_weights": graph.edge_weights} 281 | elif self.algorithm == "leading_eigenvector_naive": 282 | kwargs = {} 283 | else: 284 | kwargs = {"weights": graph.edge_weights} 285 | if self.algorithm == "edge_betweenness": 286 | kwargs["directed"] = False 287 | # TODO: Rollback to action(**kwargs, **self.config) when support for Python3.4 is over 288 | kwargs.update(self.config) 289 | result = action(**kwargs) 290 | if hasattr(result, "as_clustering"): 291 | result = result.as_clustering() 292 | 293 | output = [[] for _ in range(len(result.sizes()))] 294 | for i, memb in enumerate(result.membership): 295 | output[memb].append(int(graph.vs[i]["name"])) 296 | 297 | return output 298 | 299 | 300 | class BatchedCommunityResolver: 301 | def __init__(self, model: CommunitiesModel, batch_size: int, session: Session, table: str): 302 | self._log = logging.getLogger("BatchedCommunityResolver") 303 | self.resolver = progress_bar( 304 | BatchedHashResolver(self._gen_hashes(model), batch_size, session, table), 305 | self._log, expected_size=model.count_elements() 306 | ) 307 | self._prev = None, None, None 308 | 309 | def __next__(self): 310 | pci = self._prev[-1] 311 | com = [self._prev[:-1]] if pci is not None else [] 312 | for sha1, info, ci in self.resolver: 313 | if pci is None: 314 | pci = ci 315 | if pci == ci: 316 | com.append((sha1, info)) 317 | else: 318 | self._prev = sha1, info, ci 319 | if len(com) > 1: 320 | return com 321 | if com and pci is not None: 322 | self._prev = None, None, None 323 | if len(com) > 1: 324 | return com 325 | raise StopIteration() 326 | 327 | def __iter__(self): 328 | return self 329 | 330 | def _gen_hashes(self, model): 331 | id_to_element = model.id_to_element 332 | for i, community in enumerate(model.communities): 333 | for j in community: 334 | try: 335 | yield id_to_element[j].split("@")[1], i 336 | except IndexError: 337 | continue 338 | 339 | 340 | def dumpcmd(args): 341 | log = logging.getLogger("dumpcmd") 342 | model = CommunitiesModel().load(args.input) 343 | log.info("Initializing the sha1 resolver") 344 | communities = BatchedCommunityResolver(model, args.batch, get_db(args), args.tables["meta"]) 345 | stream_template(args.template, sys.stdout, communities=communities, model=model, 346 | model_path=os.path.abspath(args.input)) 347 | 348 | 349 | class CommunityEvaluator: 350 | def __init__(self, threshold, vocabulary_size): 351 | self.threshold = threshold 352 | self.vocabulary_size = vocabulary_size 353 | 354 | def __call__(self, community): 355 | cid, contents = community 356 | elements = defaultdict(list) 357 | 
for t in contents: 358 | elements[t[0]].append(t[1:]) 359 | if len(elements) == 1: 360 | return (0,) * 4 361 | for key, vals in elements.items(): 362 | vec = numpy.zeros(self.vocabulary_size, dtype=numpy.float32) 363 | for i, w in vals: 364 | vec[i] = w 365 | elements[key] = vec 366 | misses = 0 367 | loss = 0 368 | for x, e1 in elements.items(): 369 | for y, e2 in elements.items(): 370 | if x >= y: 371 | continue 372 | sim = weighted_jaccard(e1, e2) 373 | if sim < self.threshold: 374 | loss += (sim - self.threshold) ** 2 375 | misses += 1 376 | count = len(elements) * (len(elements) - 1) / 2 377 | return misses, misses / count, loss, loss / count 378 | 379 | 380 | def evaluate_communities(args): 381 | log = logging.getLogger("evalcc") 382 | model = CommunitiesModel().load(args.input) 383 | configure(args) 384 | spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark)) 385 | log.info("Preparing the communities' RDD") 386 | items = [] 387 | for i, c in progress_bar(enumerate(model.communities), log, 388 | expected_size=len(model.communities)): 389 | for m in c: 390 | if m < len(model.id_to_element): 391 | items.append(Row(sha1=model.id_to_element[m], community=i)) 392 | log.info("Running") 393 | items_in_spark = spark.sparkContext.parallelize(items).toDF() 394 | bags = spark \ 395 | .read \ 396 | .format("org.apache.spark.sql.cassandra") \ 397 | .options(table=args.tables["bags"], keyspace=args.keyspace) \ 398 | .load() 399 | log.info("Loaded the bags, calculating the vocabulary") 400 | vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect() 401 | vocabulary = {v: i for i, v in enumerate(vocabulary)} 402 | log.info("Vocabulary size: %d", len(vocabulary)) 403 | element_to_id = {e: i for i, e in enumerate(model.id_to_element)} 404 | metrics = items_in_spark.join(bags, "sha1").rdd \ 405 | .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \ 406 | .groupByKey() \ 407 | .map(CommunityEvaluator(args.threshold, len(vocabulary))) \ 408 | .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)]) 409 | log.info("Total misses: %d", metrics[0]) 410 | log.info("Average normalized misses: %f", metrics[1] / len(model.communities)) 411 | log.info("Total loss: %f", metrics[2]) 412 | log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities))) 413 | -------------------------------------------------------------------------------- /apollo/hasher.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from uuid import uuid4 4 | 5 | from bblfsh import BblfshClient 6 | from modelforge.model import Model 7 | from modelforge.models import register_model 8 | import numpy 9 | from pyspark.sql.types import Row 10 | from scipy.integrate import quad as integrate 11 | from sourced.ml.models import OrderedDocumentFrequencies 12 | from sourced.ml.utils import create_spark 13 | from sourced.ml.transformers.bow_writer import BOWLoader 14 | from sourced.ml.extractors import __extractors__ 15 | from sourced.ml.extractors.helpers import filter_kwargs 16 | from sourced.ml.algorithms import log_tf_log_idf 17 | 18 | from apollo import cassandra_utils 19 | 20 | ##################################################################################### 21 | # Begin code from https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py # 22 | ##################################################################################### 23 | 24 | 25 | def 
_false_positive_probability(threshold, b, r): 26 | def _probability(s): 27 | return 1 - (1 - s**float(r))**float(b) 28 | a, err = integrate(_probability, 0.0, threshold) 29 | return a 30 | 31 | 32 | def _false_negative_probability(threshold, b, r): 33 | def _probability(s): 34 | return 1 - (1 - (1 - s**float(r))**float(b)) 35 | a, err = integrate(_probability, threshold, 1.0) 36 | return a 37 | 38 | 39 | def calc_hashtable_params(threshold, sample_size, false_positive_weight=0.5, 40 | false_negative_weight=0.5): 41 | """ 42 | Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum 43 | of probabilities of false positive and false negative. 44 | 45 | :return: tuple(number of hashtables, size of each band). 46 | """ 47 | min_error = float("inf") 48 | opt = (0, 0) 49 | for b in range(1, sample_size + 1): 50 | max_r = int(sample_size / b) 51 | for r in range(1, max_r+1): 52 | fp = _false_positive_probability(threshold, b, r) 53 | fn = _false_negative_probability(threshold, b, r) 54 | error = fp*false_positive_weight + fn*false_negative_weight 55 | if error < min_error: 56 | min_error = error 57 | opt = (b, r) 58 | return opt 59 | 60 | 61 | ##################################################################################### 62 | # End code from https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py # 63 | ##################################################################################### 64 | 65 | 66 | @register_model 67 | class WeightedMinHashParameters(Model): 68 | """ 69 | The randomly generated parameters of the Weighted MinHash-er. 70 | """ 71 | NAME = "wmhparams" 72 | 73 | def construct(self, rs, ln_cs, betas): 74 | self.rs = rs 75 | self.ln_cs = ln_cs 76 | self.betas = betas 77 | rs[0] + ln_cs[0] + betas[0] # do not remove - this loads the arrays from disk 78 | return self 79 | 80 | def _load_tree(self, tree): 81 | self.construct(rs=tree["rs"], ln_cs=tree["ln_cs"], betas=tree["betas"]) 82 | 83 | def dump(self): 84 | return """Shape: %s""" % (self.rs.shape,) 85 | 86 | def _generate_tree(self): 87 | return {"rs": self.rs, "ln_cs": self.ln_cs, "betas": self.betas} 88 | 89 | 90 | class HashExploder: 91 | def __init__(self, htnum, band_size): 92 | self.htnum = htnum 93 | self.band_size = band_size 94 | 95 | def __call__(self, record): 96 | key, wmh = record 97 | for hti in range(self.htnum): 98 | yield Row(sha1=key, hashtable=hti, 99 | value=bytearray(wmh[hti * self.band_size:(hti + 1) * self.band_size].data)) 100 | 101 | 102 | def modify_feature_weights(batches, arguments, **kwargs): 103 | extractors = {} 104 | for ex in __extractors__.values(): 105 | if "%s_weight" % ex.NAME in dir(arguments) and \ 106 | getattr(arguments, "%s_weight" % ex.NAME) != 1: 107 | extractors[ex.NAME] = (ex.NAMESPACE, getattr(arguments, "%s_weight" % ex.NAME)) 108 | 109 | if not extractors: 110 | return batches 111 | 112 | err = "You must specify location of docfreq file to modify weights of features" 113 | assert arguments.docfreq is not None, err 114 | assert os.path.isfile(arguments.docfreq), "docfreq should be a file" 115 | 116 | model = OrderedDocumentFrequencies().load(arguments.docfreq) 117 | feature_mapping = model.order 118 | 119 | voc_size = batches[0].matrix.shape[-1] 120 | weights = numpy.ones((voc_size,)) 121 | 122 | for ext in extractors: 123 | namespace = extractors[ext][0] 124 | ind = [feature_mapping[k] for k in feature_mapping if k.startswith(namespace)] 125 | weights[ind] = extractors[ext][1] 126 | 127 | for batch in batches: 128 | # hack to modify attribute in 
namedtuple 129 | batch.matrix.data = batch.matrix.multiply(weights).tocsr().data.astype(numpy.float32) 130 | 131 | return batches 132 | 133 | 134 | def hash_batches(args): 135 | log = logging.getLogger("hash") 136 | log.info("Loading files from %s", args.input) 137 | loader = BOWLoader(args.input) 138 | log.info("%d batches", len(loader)) 139 | 140 | # Check batches 141 | if not loader: 142 | return 143 | 144 | htnum, band_size = calc_hashtable_params( 145 | args.threshold, args.size, args.false_positive_weight, args.false_negative_weight) 146 | log.info("Number of hash tables: %d", htnum) 147 | log.info("Band size: %d", band_size) 148 | cassandra_utils.configure(args) 149 | spark_args = filter_kwargs(args.__dict__, create_spark) 150 | spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext 151 | import libMHCUDA # delayed import which requires CUDA and friends 152 | tables = args.tables 153 | gen = voc_size = None 154 | try: 155 | for i, bow in enumerate(loader): 156 | if voc_size is None: 157 | voc_size = bow.matrix.shape[-1] 158 | log.info("Initializing the generator") 159 | deferred = os.path.isfile(args.params) 160 | gen = libMHCUDA.minhash_cuda_init( 161 | voc_size, args.size, seed=args.seed, devices=args.devices, 162 | verbosity=args.mhc_verbosity, 163 | deferred=deferred) 164 | if deferred: 165 | model = WeightedMinHashParameters().load(args.params) 166 | libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas) 167 | else: 168 | log.info("Writing %s", args.params) 169 | params = libMHCUDA.minhash_cuda_retrieve_vars(gen) 170 | WeightedMinHashParameters().construct(*params).save(args.params) 171 | if bow.matrix.shape[-1] != voc_size: 172 | raise ValueError("The vocabulary sizes do not match: %d != %d" 173 | % (bow.matrix.shape[-1], voc_size)) 174 | log.info("Processing batch %d / %d", i + 1, len(loader)) 175 | # Modify features if needed 176 | # TODO(vmarkovtsev): port to the new structure 177 | # batches = modify_feature_weights(batches, args) 178 | hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix) 179 | job = [(k, h) for k, h in zip(bow.documents, hashes)] 180 | log.info("Saving the hashtables") 181 | df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)) \ 182 | .coalesce(args.partitions, args.shuffle) \ 183 | .toDF() 184 | df.write \ 185 | .format("org.apache.spark.sql.cassandra") \ 186 | .mode("append") \ 187 | .options(table=tables["hashtables"], keyspace=args.keyspace) \ 188 | .save() 189 | df.write \ 190 | .format("org.apache.spark.sql.cassandra") \ 191 | .mode("append") \ 192 | .options(table=tables["hashtables2"], keyspace=args.keyspace) \ 193 | .save() 194 | log.info("Saving the hashes") 195 | spark.parallelize(job) \ 196 | .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \ 197 | .coalesce(args.partitions, args.shuffle) \ 198 | .toDF() \ 199 | .write \ 200 | .format("org.apache.spark.sql.cassandra") \ 201 | .mode("append") \ 202 | .options(table=tables["hashes"], keyspace=args.keyspace) \ 203 | .save() 204 | finally: 205 | libMHCUDA.minhash_cuda_fini(gen) 206 | 207 | 208 | def hash_file(args): 209 | if not args.feature: 210 | raise ValueError("extractors must not be empty") 211 | log = logging.getLogger("hash_file") 212 | vocab = OrderedDocumentFrequencies().load(args.docfreq) 213 | params = WeightedMinHashParameters().load(args.params) 214 | log.info("Extracting UAST from %s", args.file) 215 | uast = BblfshClient(args.bblfsh).parse(args.file).uast 216 | log.info("Populating the bag") 217 | extractors = 
[__extractors__[s]( 218 | args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args)) 219 | for s in args.feature] 220 | bag = numpy.zeros(len(vocab), dtype=numpy.float32) 221 | for ex in extractors: 222 | ex.ndocs = vocab.docs 223 | ex.docfreq = vocab 224 | for k, v in ex.extract(uast): 225 | try: 226 | i = vocab.order[k] 227 | bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs) 228 | except KeyError: 229 | continue 230 | 231 | log.info("Bag size: %d", len(bag.nonzero()[0])) 232 | log.info("Hashing") 233 | 234 | return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs, params.betas), bag 235 | 236 | 237 | def weighted_minhash(v, sample_size, rs, ln_cs, betas): 238 | if sample_size != rs.shape[0]: 239 | raise ValueError("Input sample size mismatch, expecting %d" % rs.shape[0]) 240 | if len(v) != rs.shape[1]: 241 | raise ValueError("Input dimension mismatch, expecting %d" % rs.shape[1]) 242 | 243 | hashvalues = numpy.zeros((sample_size, 2), dtype=numpy.uint32) 244 | vzeros = (v == 0) 245 | if vzeros.all(): 246 | raise ValueError("Input is all zeros") 247 | v[vzeros] = numpy.nan 248 | vlog = numpy.log(v) 249 | v[vzeros] = 0 250 | for i in range(sample_size): 251 | t = numpy.floor((vlog / rs[i]) + betas[i]) 252 | ln_y = (t - betas[i]) * rs[i] 253 | ln_a = ln_cs[i] - ln_y - rs[i] 254 | k = numpy.nanargmin(ln_a) 255 | hashvalues[i][0], hashvalues[i][1] = k, int(t[k]) 256 | return hashvalues 257 | -------------------------------------------------------------------------------- /apollo/query.md.jinja2: -------------------------------------------------------------------------------- 1 | # Similar to {{ origin }} 2 | 3 | Size: {{ size }} 4 | 5 | | SHA1 | Repository | Commit | Path | 6 | |:----:|:-----------|:-------|:-----| 7 | {% for sha1, (repo, commit, path) in items | sort %} 8 | | `{{ sha1 }}` | [{{ repo.rsplit(".git")[0] }}](https://{{ repo.rsplit(".git")[0] }}) | `{{ commit[:8] }}` | [{{ path }}]({{ format_url(repo, commit, path) }}) | 9 | {% endfor %} 10 | -------------------------------------------------------------------------------- /apollo/query.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import logging 3 | import os 4 | import sys 5 | 6 | import jinja2 7 | import numpy 8 | from sourced.ml.models import OrderedDocumentFrequencies 9 | 10 | from apollo.cassandra_utils import get_db, BatchedHashResolver 11 | from apollo.hasher import hash_file, calc_hashtable_params 12 | 13 | 14 | def query(args): 15 | log = logging.getLogger("query") 16 | session = get_db(args) 17 | tables = args.tables 18 | if args.id: 19 | rows = session.execute( 20 | "SELECT hashtable, value FROM %s WHERE sha1='%s'" % (tables["hashtables2"], args.id)) 21 | bands = [(r.hashtable, r.value) for r in rows] 22 | else: 23 | # args.file 24 | if not args.feature: 25 | log.critical("-f / --feature must be specified at least once in file query mode") 26 | return 1 27 | if not args.params: 28 | log.critical("-p / --params must be specified in file query mode") 29 | return 1 30 | wmh, bag = hash_file(args) 31 | htnum, band_size = calc_hashtable_params( 32 | args.threshold, len(wmh), args.false_positive_weight, args.false_negative_weight) 33 | log.info("Number of hash tables: %d", htnum) 34 | log.info("Band size: %d", band_size) 35 | bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data)) 36 | for i in range(htnum)] 37 | similar = set() 38 | log.info("Looking for similar items") 39 | for i, band in bands: 40 | rows = 
session.execute( 41 | "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s" 42 | % (tables["hashtables"], i, codecs.encode(band, "hex").decode())) 43 | similar.update(r.sha1 for r in rows) 44 | log.info("Fetched %d items", len(similar)) 45 | if args.precise: 46 | # Precise bags 47 | vocab = OrderedDocumentFrequencies().load(args.docfreq) 48 | log.info("Calculating the precise result") 49 | if args.id: 50 | rows = session.execute( 51 | "SELECT item, value FROM %s WHERE sha1='%s'" % (tables["bags"], args.id)) 52 | bag = numpy.zeros(len(vocab), dtype=numpy.float32) 53 | for row in rows: 54 | bag[vocab.order[row.item]] = row.value 55 | # Fetch other bags from the DB 56 | precise = [] 57 | for x in similar: 58 | rows = session.execute( 59 | "SELECT item, value FROM %s WHERE sha1='%s'" % (tables["bags"], x)) 60 | other_bag = numpy.zeros(len(vocab), dtype=numpy.float32) 61 | for row in rows: 62 | other_bag[vocab.order[row.item]] = row.value 63 | if weighted_jaccard(bag, other_bag) >= args.threshold: 64 | precise.append(x) 65 | log.info("Survived: %.2f", len(precise) / len(similar)) 66 | similar = precise 67 | if args.id: 68 | try: 69 | similar.remove(args.id) 70 | except KeyError: 71 | # o_O 72 | pass 73 | 74 | similar = [s.split("@")[1] for s in similar] 75 | stream_template(args.template, sys.stdout, size=len(similar), 76 | origin=args.id if args.id else os.path.abspath(args.file), 77 | items=BatchedHashResolver(similar, args.batch, session, tables["meta"])) 78 | 79 | 80 | def weighted_jaccard(vec1, vec2): 81 | return numpy.minimum(vec1, vec2).sum() / numpy.maximum(vec1, vec2).sum() 82 | 83 | 84 | def format_url(repo, commit, path): 85 | if repo.endswith(".git"): 86 | repo = repo[:-4] 87 | if repo.startswith("github.com") or repo.startswith("gitlab.com"): 88 | return "https://%s/blob/%s/%s" % (repo, commit, path) 89 | if repo.startswith("bitbucket.org"): 90 | return "https://%s/src/%s/%s" % (repo, commit, path) 91 | return "[%s %s %s]" % (repo, commit, path) 92 | 93 | 94 | def stream_template(name, dest, **kwargs): 95 | log = logging.getLogger("jinja2") 96 | log.info("Loading the template") 97 | loader = jinja2.FileSystemLoader(("/", os.path.dirname(__file__), os.getcwd()), 98 | followlinks=True) 99 | env = jinja2.Environment( 100 | trim_blocks=True, 101 | lstrip_blocks=True, 102 | keep_trailing_newline=False, 103 | ) 104 | template = loader.load(env, name) 105 | log.info("Rendering") 106 | template.stream(format_url=format_url, **kwargs).dump(dest) 107 | -------------------------------------------------------------------------------- /apollo/report.md.jinja2: -------------------------------------------------------------------------------- 1 | # Code similarity report 2 | 3 | ### Model 4 | Path: `{{ model_path }}` 5 | 6 | Items: {{ model.id_to_element | length }} 7 | 8 | Cardinality: {{ model.communities | length }} 9 | 10 | ### Communities 11 | {% for com in communities %} 12 | 13 | | SHA1 | Repository | Commit | Path | 14 | |:----:|:-----------|:-------|:-----| 15 | {% for sha1, (repo, commit, path) in com | sort %} 16 | | `{{ sha1 }}` | [{{ repo.rsplit(".git")[0] }}](https://{{ repo.rsplit(".git")[0] }}) | `{{ commit[:8] }}` | [{{ path }}]({{ format_url(repo, commit, path) }}) | 17 | {% endfor %} 18 | {% endfor %} 19 | -------------------------------------------------------------------------------- /apollo/warmup.py: -------------------------------------------------------------------------------- 1 | from sourced.ml.extractors.helpers import filter_kwargs 2 | from sourced.ml.utils import 
create_engine
3 | 
4 | 
5 | def warmup(args):
6 |     engine_args = filter_kwargs(args.__dict__, create_engine)
7 |     create_engine("warmup", "/tmp", **engine_args)
8 | 
--------------------------------------------------------------------------------
/doc/101.md:
--------------------------------------------------------------------------------
1 | # Brief guide to finding similar source code with Apollo
2 | 
3 | ### Environment
4 | 
5 | [Babelfish must be running and have the Java driver installed.](https://doc.bblf.sh/user/getting-started.html)
6 | Cassandra or ScyllaDB must be running.
7 | 
8 | ### Prepare the source code
9 | 
10 | Apollo works with Git repositories stored in [Siva](https://github.com/src-d/go-siva) format.
11 | Refer to [Borges](https://github.com/src-d/borges). We expect that the files will be in `/data` below.
12 | 
13 | ### Extract the features
14 | 
15 | We convert every file into a [weighted set of features](https://en.wikipedia.org/wiki/Bag-of-words_model).
16 | The batches for the `hash` command are written to `./bow*.asdf` (split into 2 GB batches by default) and
17 | the calculated global feature value frequencies are written to `./docfreq.asdf`. We use three
18 | extractors: literals, identifiers and deterministic AST subpaths of size 4. We double the importance
19 | of the latter features and throw away any values which appear in fewer than 4 files. Only Java source
20 | code is analysed. We optimize the pipeline execution by using the disk cache to save
21 | the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html) between passes. The extracted bags
22 | are additionally saved in the database.
23 | 
24 | ```
25 | apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf \
26 | -f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
27 | -l Java Python --persist DISK_ONLY
28 | ```
29 | 
30 | > Docker users should add `--bblfsh bblfshd --cassandra cassandra`.
31 | 
32 | More about [`bags`](cmd/bags.md).
33 | 
34 | ### Hash the samples
35 | 
36 | We hash the files which were converted into bags in the previous step and stored as several
37 | `./bow*.asdf` files. The hashing parameters are written to `./params.asdf`.
38 | The Weighted Jaccard Similarity threshold equals `0.8`; the closer it is to 1, the fewer files are considered
39 | similar. The hashtables are written to the database.
40 | 
41 | ```
42 | apollo hash 'bow*.asdf' -p params.asdf -t 0.8
43 | ```
44 | 
45 | > This step requires an NVIDIA GPU.
46 | 
47 | > Docker users should add `--cassandra=cassandra`.
48 | 
49 | More about [`hash`](cmd/hash.md).
50 | 
51 | ### Query for a file
52 | 
53 | Given a Git hash of a file in the dataset, list the similar files:
54 | ```
55 | apollo query -i 
56 | ```
57 | 
58 | > Docker users should add `--cassandra cassandra`.
59 | 
60 | More about [`query`](cmd/query.md).
61 | 
62 | ### Find groups of similar files
63 | 
64 | Find connected components in the resulting similarity graph and write them to `./cc.asdf`.
65 | 
66 | ```
67 | apollo cc -o cc.asdf
68 | ```
69 | 
70 | > Docker users should add `--cassandra cassandra`.
71 | 
72 | Run the default community detection algorithm and write the clusters to `./communities.asdf`.
73 | 
74 | ```
75 | apollo cmd -i cc.asdf -o communities.asdf
76 | ```
77 | 
78 | > Docker users should add `--cassandra cassandra`.
79 | 
80 | Output the report to stdout.
81 | 
82 | ```
83 | apollo dumpcmd communities.asdf
84 | ```
85 | 
86 | > Docker users should add `--cassandra cassandra`.
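If you would rather inspect the clusters programmatically than read the rendered report, the communities model written by `cmd` can be loaded with Apollo's Python API. A minimal sketch; the file name and the printing logic are only for illustration:

```
from apollo.graph import CommunitiesModel

model = CommunitiesModel().load("communities.asdf")
for community in model.communities:
    # Indices that do not fit into id_to_element correspond to the artificial
    # bucket vertices created by the "linear" edge mode, so skip them.
    members = [model.id_to_element[i] for i in community
               if i < len(model.id_to_element)]
    if len(members) > 1:
        print(members)
```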
87 | 
88 | More about: [`cc`](cmd/cc.md), [`cmd`](cmd/cmd.md), [`dumpcmd`](cmd/dumpcmd.md).
89 | 
--------------------------------------------------------------------------------
/doc/GLOSSARY.md:
--------------------------------------------------------------------------------
1 | ## Model
2 | A model is the artifact from running an analysis pipeline.
3 | It is plain data with some methods to access it.
4 | A model can be serialized to bytes and deserialized from bytes.
5 | The underlying storage format is specific to [src-d/modelforge](https://github.com/src-d/modelforge)
6 | and is currently [ASDF](https://github.com/spacetelescope/asdf)
7 | with [lz4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
8 | 
9 | ## Pipeline
10 | A tree of linked `sourced.ml.transformers.Transformer` objects which can be executed on PySpark/source{d} engine.
11 | The result is often written on disk as [Parquet](https://parquet.apache.org/) or model files
12 | or to a database.
13 | 
14 | ## Feature
15 | A property of a source code sample.
16 | 
17 | ## Weighted MinHash
18 | An algorithm to approximate the [Weighted Jaccard Similarity](https://en.wikipedia.org/wiki/Jaccard_index#Generalized_Jaccard_similarity_and_distance)
19 | between all the pairs of source code samples in linear time and space. Described by
20 | [Sergey Ioffe](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf).
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | Apollo is a research project started by [source{d}](https://sourced.tech) to find duplicated
4 | source code at scale. It is written in Python 3 and relies on [source{d} engine](https://engine.sourced.tech)
5 | to process "big" source code.
6 | 
7 | Big source code warehouses like GitHub inevitably contain much duplication. Snippets, files or even
8 | projects may have very few differences. Apollo makes it possible to accurately mine those groups of similar
9 | items. Subsequently, a report may be generated to point out refactoring possibilities.
10 | While Apollo can be applied at a small scale, e.g. within a single project, it does not try to replace
11 | any existing tools in that niche.
12 | 
13 | Behind the scenes, all source code samples are hashed with an algorithm which is
14 | [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)-friendly:
15 | similar samples have similar, almost the same hashes. See [the detailed explanation](algorithm.md).
16 | Similarity is subjective and depends on human opinion. Apollo lets you combine
17 | various feature extractors and optimize their weights together with the overall threshold
18 | according to the reference dataset. The reference dataset is the only source of ground truth
19 | and should be manually labelled by a user; however, Apollo supplies sane defaults.
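The similarity measure behind that threshold is the weighted Jaccard index over the feature bags, mirroring the `weighted_jaccard` helper in `apollo/query.py`; the two toy vectors below are invented for illustration:

```
import numpy


def weighted_jaccard(vec1, vec2):
    # vec1 and vec2 hold the feature weights of two samples over a shared vocabulary.
    return numpy.minimum(vec1, vec2).sum() / numpy.maximum(vec1, vec2).sum()


print(weighted_jaccard(numpy.array([1.0, 0.0, 2.0]), numpy.array([1.0, 3.0, 1.0])))  # ~0.33
```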
Gemini](gemini.md) 5 | * [Installation](install/README.md) 6 | * [Pip](install/pip.md) 7 | * [Docker](install/docker.md) 8 | * [Database initialization](install/db.md) 9 | * [Walkthrough](101.md) 10 | * Commands reference 11 | * [resetdb](cmd/resetdb.md) 12 | * [preprocess](cmd/preprocess.md) 13 | * [bags](cmd/bags.md) 14 | * [hash](cmd/hash.md) 15 | * [query](cmd/query.md) 16 | * [cc](cmd/cc.md) 17 | * [dumpcc](cmd/dumpcc.md) 18 | * [cmd](cmd/cmd.md) 19 | * [dumpcmd](cmd/dumpcmd.md) 20 | * [evalcc](cmd/evalcc.md) 21 | * Models reference 22 | * [Weighted MinHash parameters](model/wmh.md) 23 | * [Connected components](model/cc.md) 24 | * [Communities](model/cmd.md) 25 | -------------------------------------------------------------------------------- /doc/algorithm.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/apollo/fcdf67bb579681bbf978168e909cd74207ed06db/doc/algorithm.md -------------------------------------------------------------------------------- /doc/cmd/bags.md: -------------------------------------------------------------------------------- 1 | # Bags command 2 | 3 | This command converts the input repositories to unordered weighted bags of features that are stored in the DB, writes the MinHashCUDA batches, and writes the Ordered Document Frequency model as well as the optional Quantization Levels model. You can specify the following arguments: 4 | 5 | - `-r`/`--repositories`: Path to the input files 6 | - `--parquet`: If your input files are Parquet files 7 | - `--graph`: Path to the output Graphviz file, if you wish to keep the tree 8 | - `-l`/`--languages`: Languages to keep, defaults to all languages detected by Babelfish 9 | - `--dzhigurda`: Index of the last commit to keep, defaults to 0 (only the head), 1 is HEAD~2, etc. 10 | - `--bow`: Path to the output batches 11 | - `--batch`: The maximum size of a single batch in bytes 12 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 13 | - `--docfreq-in`: Path to a precomputed Ordered Document Frequency model 14 | - `--docfreq-out`: Path to the output Ordered Document Frequency model (cannot be used with `--docfreq-in`) 15 | - `-v`/`--vocabulary-size`: Maximum vocabulary size, defaults to 10 million 16 | - `--cached-index-path`: Path to a precomputed Document Frequency model storing an index of the documents to be extracted 17 | - `--partitions`: Used to repartition data, specifies the new number of partitions 18 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 19 | - [Feature arguments](features.md) 20 | - [Spark and Engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 21 | - [Cassandra/Scylla arguments](db.md) 22 | -------------------------------------------------------------------------------- /doc/cmd/cc.md: -------------------------------------------------------------------------------- 1 | # CC command 2 | 3 | This command runs the connected components analysis on previously created hash tables, 4 | and saves the CCs in [this `Model`](/doc/model/cc.md). 
You can specify the following arguments: 5 | 6 | - `-o`/`--output`: Path to the output Connected Components model 7 | - [Cassandra/Scylla arguments](db.md) 8 | 9 | -------------------------------------------------------------------------------- /doc/cmd/cmd.md: -------------------------------------------------------------------------------- 1 | # CMD command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command runs the community detection on a previously created Connected Components 6 | model, and saves the detected communities in [this `Model`](/doc/model/cmd.md). You can specify 7 | the following arguments: 8 | 9 | - `-i`/`--input`: Path to the input Connected Components model 10 | - `-o`/`--output`: Path to the output Community Detection model 11 | - `--edges`: Specific method used to generate edges: quadratic will connect each item in each bucket to all other items, while the default, linear, will create for each bucket an artificial vertex to which all items will be connected. Depending on the method, the edges will be created in quadratic or linear time, relative to the number of buckets 12 | - `--no-spark`: If you do not want to use Spark - *but who would want that?* 13 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) if you choose to use it 14 | - `-a`/`--algorithm`: Community detection algorithm to apply, defaults to `walktrap`, check out the [igraph](http://igraph.org/c/doc/igraph-Community.html) doc to learn more. See below for the full list of available algorithms and their parameters as of today (see the code in `igraph/__init__.py` for a description of the parameters); we excluded parameters that modify the returned object 15 | - `-p`/`--params`: Depending on the algorithm, you may need to specify parameters (JSON format) 16 | 17 | | Algorithm | Parameters | 18 | |:----------:|:----------:| 19 | |community_fastgreedy|weights (for edges)| 20 | |community_infomap|edge_weights, vertex_weights, trials| 21 | |community_leading_eigenvector_naive|clusters| 22 | |community_leading_eigenvector|clusters, weights (for edges), arpack_options| 23 | |community_label_propagation|weights (for edges), initial, fixed| 24 | |community_multilevel|weights (for edges)| 25 | |community_optimal_modularity|weights (for edges)| 26 | |community_edge_betweenness|weights (for edges), clusters, directed| 27 | |community_spinglass|weights (for edges), spins, parupdate, start_temp, stop_temp, cool_fact, update_rule, gamma, _lambda, implementation| 28 | |community_walktrap|weights (for edges), steps| 29 | -------------------------------------------------------------------------------- /doc/cmd/db.md: -------------------------------------------------------------------------------- 1 | # Cassandra/Scylla arguments 2 | 3 | For all of the commands that require access to the database, you can specify the following arguments: 4 | 5 | - `--cassandra`: Specific address of your Scylla/Cassandra DB, if you are not running it locally (the format is `
<address>:<port>`, if you are using the default 9042 port then there is no need to specify it, e.g. `--cassandra scylla`) 6 | - `--keyspace`: Specific name of the Cassandra keyspace, defaults to `apollo` 7 | - `--tables`: Specific table mapping, use JSON format to modify it, e.g. `--tables {"bags": "bags_2", "hashes": "hashes_2", "hashtables": "hashtables_2", "hashtables2": "hashtables2_2"}` 8 | -------------------------------------------------------------------------------- /doc/cmd/dumpcc.md: -------------------------------------------------------------------------------- 1 | # DumpCC command 2 | 3 | This command outputs a report on the given Connected Components model to stdout; you must specify the following argument: 4 | 5 | - `-i`/`--input`: Path to the input Connected Components model 6 | -------------------------------------------------------------------------------- /doc/cmd/dumpcmd.md: -------------------------------------------------------------------------------- 1 | # DumpCMD command 2 | 3 | This command outputs a report on the given Community Detection model to stdout; you can specify the following arguments: 4 | 5 | - `--template`: Path to the `report.md.jinja2` file 6 | - `--batch`: Same as in `query`: the number of hashes to query simultaneously, defaults to 100 7 | - `-i`/`--input`: Path to the input Community Detection model 8 | - [Cassandra/Scylla arguments](db.md) 9 | -------------------------------------------------------------------------------- /doc/cmd/evalcc.md: -------------------------------------------------------------------------------- 1 | # EvalCC command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command calculates the precise similarity and fitness metrics for the given Community Detection model; you can specify the following arguments: 6 | 7 | - `-i`/`--input`: Path to the input Community Detection model 8 | - `-t`/`--threshold`: Jaccard Similarity threshold (float in [0,1]) over which we consider there is similarity, used to calculate the number of misses 9 | - [Cassandra/Scylla arguments](db.md) 10 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 11 | 12 | __Note:__ 13 | 14 | To run this command it is advised to set the Spark parameter `spark.default.parallelism` to a 15 | higher value than the default 200 partitions if you are running on large amounts of data. 
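
For example, assuming the generic `--config key=value` Spark option described in the Spark arguments doc, and illustrative file names and values, the invocation might look like this:

```
apollo evalcc -i communities.asdf -t 0.8 \
    --config spark.default.parallelism=1000
```
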
16 | -------------------------------------------------------------------------------- /doc/cmd/features.md: -------------------------------------------------------------------------------- 1 | # Feature arguments 2 | 3 | For all of the commands that extract features, you must specify the following arguments: 4 | 5 | - `-x`/`--mode`: Mode to select for analysis, defaults to `file`, can also be `repo` or `func` 6 | - `--quant`: Path to the input or output Quantization Levels model (optional, only supported for 7 | the `children` extractor) 8 | - `-f`/`--feature`: Features to extract from each item, at the moment among the ones below 9 | 10 | 11 | | Feature | Description | 12 | |----------|:---------------------------------:| 13 | | graphlet | Converts the UAST to a weighted bag of graphlets; a graphlet of a UAST node is composed of the node itself, its parent and its children | 14 | | lit | Converts the UAST to a weighted bag of literals (UAST node role) | 15 | | id | Converts the UAST to a weighted bag of identifiers (UAST node role) | 16 | | children | Converts the UAST to a bag of (internal type, quantized number of children) pairs, see [quantization](https://en.wikipedia.org/wiki/Quantization_(signal_processing)) for more info | 17 | | uast2seq | Converts the UAST to a bag of sequences of nodes; we use Depth First Search for the traversal of the UAST | 18 | | node2vec | Converts the UAST to a bag of vectorized sequences produced through a random walk | 19 | 20 | You can check out the [Babelfish documentation](https://doc.bblf.sh/) for more information about UASTs. The weights of each feature in a bag are always computed from the observed frequencies. 21 | 22 | For each of the above features you can also specify the following feature-specific arguments: 23 | 24 | | Feature | Flag | Default | Description | 25 | |----------|:---------------------------------:|:-------:|:------------:| 26 | | graphlet | --graphlet-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 27 | | lit | --lit-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 28 | | id | --id-split-stem | False | Whether to split identifiers and consider each part to be a separate one, or not | 29 | | id | --id-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 30 | | children | --children-npartitions | 10 | Number of partitions on which we apply quantization | 31 | | uast2seq | --uast2seq-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 32 | | uast2seq | --uast2seq-seq-len | 5 | Length(s) of sequences, can be a list | 33 | | uast2seq | --uast2seq-stride | 1 | Stride used to iterate through the sequenced UAST to extract subsequences of chosen length | 34 | | node2vec | --node2vec-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 35 | | node2vec | --node2vec-seq-len | (5, 6) | Length(s) of sequences to be vectorized, can be a list | 36 | | node2vec | --node2vec-p-explore-neighborhood | 0.5 | Likelihood of immediately revisiting a node in the walk (*return parameter*) | 37 | | node2vec | --node2vec-stride | 1 | Strides used to iterate through the walk sequences to extract subsequences of chosen length | 38 | | node2vec | --node2vec-seed | 42 | Seed to use to generate the random walk | 39 | | node2vec | --node2vec-q-leave-neighborhood | 0.5 | Modulates the ability to differentiate between inward and outward nodes (*in-out parameter*) | 40 | | node2vec | --node2vec-n-walks | 5 | Number of walks from each node. 
| 41 | | node2vec | --node2vec-n-steps | 19 | Number of steps in each walk. | -------------------------------------------------------------------------------- /doc/cmd/hash.md: -------------------------------------------------------------------------------- 1 | # Hash command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command applies the MinHashCUDA algorithm on previously written batches, 6 | stores hashes and hash tables in the DB and saves the Weighted MinHash (WMH) parameters 7 | in [this `Model`](/doc/model/wmh.md). You can specify the following arguments: 8 | 9 | - `-i`/`--input`: Path to the input batch(es) 10 | - `--seed`: Specific random generator seed (useful for cross-execution comparisons), defaults to a random number derived from the current time 11 | - `--mhc-verbosity`: MinHashCUDA log level, specify 0 for silence or 2 for full logs, 1 is the default and just shows progress 12 | - `--devices`: Index of the NVIDIA device to use, defaults to 0 (all available) 13 | - `--docfreq`: Path to the input Ordered Document Frequency model 14 | - `--size`: Hash size, defaults to 128 15 | - `--partitions`: Used to repartition data, specifies the new number of partitions 16 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 17 | - [Cassandra/Scylla arguments](db.md) 18 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 19 | 20 | You must also specify the WMH arguments: 21 | 22 | - `-p`/`--params`: Path to the output WMH parameters 23 | - `-t`/`--threshold`: Jaccard Similarity threshold (float in [0,1]) over which we consider there is similarity 24 | - `--false-positive-weight`: Parameter that adjusts the relative importance of minimizing the false positive count when optimizing for the Jaccard similarity threshold, defaults to 0.5 25 | - `--false-negative-weight`: Same for false negatives 26 | -------------------------------------------------------------------------------- /doc/cmd/preprocess.md: -------------------------------------------------------------------------------- 1 | # Preprocess command 2 | 3 | This command computes the index and Ordered Document Frequency model for the input repositories, 4 | and optionally the Quantization Levels model if the selected features support it. Currently, running the 5 | `bags` command on large inputs can result in failures; this command allows you to create all the necessary 6 | data to run on subsets of your repositories. As you will be applying TF-IDF, be aware that your 7 | subsets must be disjoint, i.e. if you are running in `repo` mode then repos **must not** be spread 8 | out in different subsets, or there will be duplicate features. 
You can specify the following 9 | arguments: 10 | 11 | - `-r`/`--repositories`: Path to the input files 12 | - `--parquet`: If your input files are Parquet files 13 | - `--graph`: Path to the output Graphviz file, if you wish to keep the tree 14 | - `-l`/`--languages`: Languages to keep, defaults to all languages detected by Babelfish 15 | - `--dzhigurda`: Index of the last commit to keep, defaults to 0 (only the head), 1 is HEAD~2, etc. 16 | - `--bow`: Path to the output batches 17 | - `--batch`: The maximum size of a single batch in bytes 18 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 19 | - `--docfreq-out`: Path to the output Ordered Document Frequency model 20 | - `-v`/`--vocabulary-size`: Maximum vocabulary size, defaults to 10 million 21 | - `--cached-index-path`: Path to the output Document Frequency model storing the index of all documents 22 | - `--partitions`: Used to repartition data, specifies the new number of partitions 23 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 24 | - [Feature arguments](features.md) 25 | - [Spark and Engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 26 | -------------------------------------------------------------------------------- /doc/cmd/query.md: -------------------------------------------------------------------------------- 1 | # Query command 2 | 3 | This command finds items similar to the one specified, and outputs them using the `query.md.jinja2` report file. There are two mutually exclusive query modes. For both of them you can specify the following arguments: 4 | 5 | - `--precise`: Whether to calculate the precise set or not 6 | - `--template`: Path to `query.md.jinja2` 7 | - `--batch`: Number of hashes to query simultaneously, defaults to 100 8 | - [Cassandra/Scylla arguments](db.md) 9 | 10 | **Id mode:** 11 | 12 | In this mode, the file is already in the database and its features have been extracted. You only need to specify which file you wish to pick with: 13 | 14 | - `-i`/`--id`: SHA1 identifier of the file. 15 | 16 | **File mode:** 17 | 18 | In this mode, the file is not in the database, so additionally we have to extract the bag of features from that file and apply the MinHashCUDA algorithm on them. You must specify the following arguments: 19 | 20 | - `-c`/`--file`: Absolute path of the file 21 | - `--bblfsh`: Same as in the [engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 22 | - `--docfreq`: Path to the input Ordered Document Frequency model created while running the `bags` command (optional) 23 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 24 | - [Feature arguments](bags.md) also used by the `bags` command 25 | - [WMH arguments](hash.md) also used by the `hash` command 26 | -------------------------------------------------------------------------------- /doc/cmd/resetdb.md: -------------------------------------------------------------------------------- 1 | # Resetdb command 2 | 3 | This command destructively resets the database; you can specify the following arguments: 4 | 5 | - `--hashes-only`: To clear only the hash tables 6 | - [Cassandra/Scylla arguments](db.md) 7 | -------------------------------------------------------------------------------- /doc/gemini.md: -------------------------------------------------------------------------------- 1 | # Apollo vs. 
Gemini 2 | 3 | [Gemini](https://github.com/src-d/gemini) is mainly written in Scala and targets production and 4 | the bloody enterprise. Thus it is relatively less flexible, but it should be more performant and efficient. 5 | Apollo is a proving ground for innovation which feeds all its research goodies to Gemini. Regarding 6 | scalability, both are scalable and can process large amounts of data. 7 | 8 | Besides, Apollo is owned by the Machine Learning team and Gemini is owned by the Applications team. 9 | 10 | ### Which one to choose? 11 | 12 | If your goal is doing research and trying new ideas, stick with Apollo. If you want to deduplicate 13 | terabytes of sources in your organization, go with Gemini. -------------------------------------------------------------------------------- /doc/install/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Apollo can be installed in two ways: 4 | 5 | * Through [`pip install`](pip.md). 6 | * As a [Docker container](docker.md). 7 | 8 | New users are recommended to use the container image so that they do not have to set up the complex 9 | environment. 10 | 11 | It is required to [initialize the database](db.md) once Apollo is installed. 12 | -------------------------------------------------------------------------------- /doc/install/db.md: -------------------------------------------------------------------------------- 1 | # Database initialization 2 | 3 | Run the following command: 4 | 5 | ``` 6 | apollo resetdb 7 | ``` 8 | 9 | [More about `resetdb`](../cmd/resetdb.md). -------------------------------------------------------------------------------- /doc/install/docker.md: -------------------------------------------------------------------------------- 1 | # Docker image installation 2 | 3 | ### Requirements 4 | 5 | * NVIDIA GPU 6 | 7 | The following need to be installed and running: 8 | 9 | * [Babelfish](https://doc.bblf.sh/user/getting-started.html) as `bblfshd`. 10 | * [Cassandra](https://hub.docker.com/r/library/cassandra/) or [ScyllaDB](https://hub.docker.com/r/scylladb/scylla/) as `cassandra`. 11 | 12 | ### Magic command 13 | 14 | ``` 15 | docker run --rm -it srcd/apollo --help 16 | ``` 17 | 18 | Throughout the examples, `apollo` stands for the following command: 19 | 20 | ``` 21 | docker run -it --rm -v /path/to/io:/io -w /io --privileged --link bblfshd --link cassandra srcd/apollo 22 | ``` 23 | 24 | `--privileged` is needed to access the NVIDIA devices inside the container without the pain of 25 | manually specifying them; it can be replaced with `--device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0` plus 26 | other `/dev/nvidia*` devices if you've got multiple cards. -------------------------------------------------------------------------------- /doc/install/pip.md: -------------------------------------------------------------------------------- 1 | # Pip installation 2 | 3 | ### Requirements 4 | 5 | * Python 3.4+ 6 | * Linux or macOS. **Windows will not work.** 7 | * NVIDIA GPU 8 | 9 | The following need to be installed: 10 | 11 | * [source{d} engine](https://github.com/src-d/engine) with all of the dependencies such as Babelfish 12 | * [libMHCUDA](https://github.com/src-d/minhashcuda) Python package 13 | * [sourced.ml](https://github.com/src-d/ml) @ `develop` branch 14 | * [Cassandra](http://cassandra.apache.org/) or [ScyllaDB](http://www.scylladb.com/) 15 | 16 | ### Magic command 17 | 18 | ``` 19 | pip3 install git+https://github.com/src-d/apollo 20 | ``` 21 | 22 | It should run without any errors. 
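
If pip does not pull the `develop` branch of sourced.ml on its own, a hedged sketch of installing it explicitly first, using the standard pip VCS syntax (the branch name comes from the requirements above):

```
pip3 install git+https://github.com/src-d/ml@develop
```
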
23 | 24 | ### Testing 25 | 26 | ``` 27 | apollo --help 28 | ``` -------------------------------------------------------------------------------- /doc/model/cc.md: -------------------------------------------------------------------------------- 1 | # Connected Components Model 2 | 3 | This model stores the connected components found in the pairwise similarity 4 | graph after hashing by the `cc` command. 5 | 6 | **A quick reminder** 7 | 8 | A document hashes to as many buckets as there are hashtables, which means if there are 9 | 3 hashtables, then a document hashes to 3 buckets. The number of hashtables increases 10 | as the similarity threshold decreases. Any two documents that hash to at least one bucket 11 | in common are in the same component. 12 | 13 | The model has the following parameters: 14 | 15 | - `cc.id_to_cc`: a numpy array of integers of the size of the number of documents, where 16 | document `i` is in the connected component number `cc.id_to_cc[i]`; 17 | - `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary 18 | mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]` 19 | is file `i`'s filename; 20 | - `cc.id_to_buckets`: a Scipy sparse CSR matrix of the shape `number of documents` 21 | x `number of buckets`, where the element in row `i` and column `j` is equal to 1 if 22 | document `i` hashes to bucket `j`, and 0 if not. 23 | 24 | Example: 25 | 26 | ``` 27 | from apollo.graph import ConnectedComponentsModel 28 | 29 | cc = ConnectedComponentsModel().load("cc.asdf") 30 | print(cc.dump()) # prints the number of CCs and documents 31 | ``` -------------------------------------------------------------------------------- /doc/model/cmd.md: -------------------------------------------------------------------------------- 1 | # Communities Model 2 | 3 | This model stores the communities detected by the `cmd` command from a previously 4 | created Connected Components model. Its contents heavily depend on the chosen algorithm 5 | (and its parameters), but more importantly on the edge creation method, 6 | as is described in [the doc](/doc/cmd/cmd.md). Indeed, if the default linear method 7 | is chosen, then the communities will not only consist of documents, but also 8 | of **buckets**, as they will have been added to the CC graphs as artificial vertices. 9 | This means that, in this case, some communities may consist *only* of buckets. 10 | 11 | The model has the following parameters: 12 | 13 | - `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary 14 | mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]` 15 | is file `i`'s filename; 16 | - `cc.communities`: a list of lists of integers, where each integer in `cc.communities[i]` 17 | is in the `i`th community. If an element `e` in a community is an integer smaller 18 | than the length of the `cc.id_to_elements` dictionary, then it is a document. If not, 19 | it is the bucket number `e - len(cc.id_to_elements)` in the Connected Components 20 | model's `id_to_buckets` parameter which has been used as input. 21 | 22 | The model also has this method: 23 | - `cc.count_elements`: it counts the number of distinct documents in the communities 24 | (not all documents in the dictionary may be in a community, as we don't care for 25 | communities of one). Buckets are not counted by this method. 
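
As an illustration of this convention, a hedged sketch that splits one community into document names and artificial bucket indices (assuming the parameters are exposed as attributes as listed above; the local variable names are ours):

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("communities.asdf")
n_docs = len(cmd.id_to_elements)
first_community = cmd.communities[0]
# Elements below n_docs are documents, the rest are artificial bucket vertices.
documents = [cmd.id_to_elements[e] for e in first_community if e < n_docs]
buckets = [e - n_docs for e in first_community if e >= n_docs]
```
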
26 | 27 | Example: 28 | 29 | ``` 30 | from apollo.graph import CommunitiesModel 31 | 32 | cmd = CommunitiesModel().load("communities.asdf") 33 | print(cmd.dump()) # prints the number of communities (even if containing only buckets) 34 | print("Number of distinct documents: %s" % (cmd.count_elements())) 35 | ``` -------------------------------------------------------------------------------- /doc/model/wmh.md: -------------------------------------------------------------------------------- 1 | # Weighted MinHash Parameters Model 2 | 3 | This model stores the parameters generated by `libMHCUDA`'s `minhash_cuda_retrieve_vars` 4 | function, when running the `hash` command. Named like in Sergey Ioffe's paper, 5 | the parameters are: 6 | 7 | - `wmh.rs`: the quantization granularity; 8 | - `wmh.ln_cs`: the logarithm of the Cauchy variates; 9 | - `wmh.betas`: the random offset. 10 | 11 | All 3 are Numpy arrays of the shape `hash size` x `number of features`. If you wish, 12 | or need, to run the `hash` command multiple times, you should reuse this 13 | model each time, or the result will not be accurate, as the parameters will be 14 | regenerated at random. 15 | 16 | Example: 17 | 18 | ``` 19 | from apollo.hasher import WeightedMinHashParameters 20 | 21 | wmh = WeightedMinHashParameters().load("params.asdf") 22 | print(wmh.dump()) # prints the shape of matrices 23 | ``` -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | apollo: 4 | build: . 5 | image: srcd/apollo 6 | privileged: true 7 | stdin_open: true 8 | tty: true 9 | links: 10 | - bblfshd 11 | - scylla 12 | bblfshd: 13 | image: bblfsh/bblfshd 14 | privileged: true 15 | volumes: 16 | - /var/lib/bblfshd 17 | scylla: 18 | image: scylladb/scylla 19 | command: /docker-entrypoint.py --developer-mode=1 20 | volumes: 21 | - /var/lib/scylla -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cassandra_driver==3.14.0 2 | libMHCUDA==2.1.0 3 | python-igraph==0.7.1.post6 4 | jinja2==2.10 5 | sourced-ml==0.6.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os import path 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name="apollo", 7 | description="source{d} Gemini's evil twin which runs everything using Python.", 8 | version="0.1.0", 9 | license="Apache 2.0", 10 | author="source{d}", 11 | author_email="machine-learning@sourced.tech", 12 | url="https://github.com/src-d/apollo", 13 | download_url="https://github.com/src-d/apollo", 14 | packages=find_packages(exclude=("apollo.tests",)), 15 | entry_points={ 16 | "console_scripts": ["apollo=apollo.__main__:main"], 17 | }, 18 | keywords=["machine learning on source code", "weighted minhash", "minhash", 19 | "bblfsh", "babelfish"], 20 | install_requires=["cassandra_driver >= 3.12.0, <4.0", 21 | "libMHCUDA >= 2.0, <3.0", 22 | "jinja2 >=2.0, <3.0", 23 | "python-igraph >= 0.7, <2.0", 24 | "sourced-ml >= 0.6.0, <0.7"], 25 | package_data={"": ["LICENSE", "README.md"] + glob(path.join("apollo", "*.jinja2"))}, 26 | classifiers=[ 27 | "Development Status :: 3 - Alpha", 28 | "Environment :: Console", 29 | "Intended Audience :: Developers", 30 
| "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: POSIX", 32 | "Programming Language :: Python :: 3.4", 33 | "Programming Language :: Python :: 3.5", 34 | "Programming Language :: Python :: 3.6", 35 | "Topic :: Software Development :: Libraries" 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from modelforge.logs import setup_logging 4 | 5 | 6 | utmain = sys.modules['__main__'] 7 | if utmain.__package__ == "unittest" and utmain.__spec__ is None: 8 | from collections import namedtuple 9 | ModuleSpec = namedtuple("ModuleSpec", ["name"]) 10 | utmain.__spec__ = ModuleSpec("unittest.__main__") 11 | del ModuleSpec 12 | del utmain 13 | 14 | 15 | def setup(): 16 | setup_logging("INFO") 17 | -------------------------------------------------------------------------------- /tests/test_graph_CommunityDetector.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | 4 | from igraph import Graph 5 | 6 | from apollo.graph import CommunityDetector 7 | 8 | 9 | class CommunityDetectorTest(unittest.TestCase): 10 | def setUp(self): 11 | edges = [(0, 1)] 12 | weights = [1] 13 | nvertices = 2 14 | self.graph = Graph(n=nvertices, edges=edges, directed=False) 15 | self.graph.edge_weights = weights 16 | 17 | 18 | def test_generator(algorithm): 19 | def test_community_detection(self): 20 | cmd = CommunityDetector(algorithm=algorithm, config={}) 21 | res = cmd(self.graph) 22 | self.assertEqual(len(set(itertools.chain(*res))), 2) # Check number of unique vertices 23 | 24 | return test_community_detection 25 | 26 | 27 | if __name__ == "__main__": 28 | algorithms = ["spinglass", "optimal_modularity", "multilevel", "label_propagation", 29 | "leading_eigenvector", "leading_eigenvector", "infomap", "walktrap", 30 | "fastgreedy"] 31 | for algorithm in algorithms: 32 | test_name = "test_community_detection_%s" % algorithm 33 | test = test_generator(algorithm) 34 | setattr(CommunityDetectorTest, test_name, test) 35 | print([method for method in dir(CommunityDetectorTest) 36 | if "test_community_detection_" in method]) 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tests/test_graph_ConnectedComponents.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import unittest 4 | 5 | from apollo.graph import _find_connected_component 6 | 7 | 8 | class ConnectedComponentsTest(unittest.TestCase): 9 | def test_empty_connected_component(self): 10 | buckets = [] 11 | element_to_buckets = defaultdict(set) 12 | 13 | res = _find_connected_component(buckets, element_to_buckets) 14 | self.assertEqual(0, len(res)) 15 | self.assertTrue(set(itertools.chain(*buckets)) == set(itertools.chain(*res.values()))) 16 | 17 | def test_one_connected_component(self): 18 | buckets = [] 19 | element_to_buckets = defaultdict(set) 20 | 21 | # Create one connected component 22 | for _ in range(5): 23 | bucket_id = len(buckets) 24 | buckets.append([bucket_id, bucket_id + 1]) 25 | element_to_buckets[bucket_id].add(bucket_id) 26 | element_to_buckets[bucket_id + 1].add(bucket_id) 27 | res = _find_connected_component(buckets, element_to_buckets) 28 | self.assertEqual(1, len(res)) 29 | self.assertTrue(set(itertools.chain(*buckets)) == 
set(itertools.chain(*res.values()))) 30 | 31 | def test_two_connected_components(self): 32 | buckets = [] 33 | element_to_buckets = defaultdict(set) 34 | 35 | # Create one connected component 36 | for _ in range(5): 37 | bucket_id = len(buckets) 38 | buckets.append([bucket_id, bucket_id + 1]) 39 | element_to_buckets[bucket_id].add(bucket_id) 40 | element_to_buckets[bucket_id + 1].add(bucket_id) 41 | 42 | bucket_id = len(buckets) 43 | buckets.append([bucket_id]) 44 | element_to_buckets[bucket_id].add(bucket_id) 45 | 46 | # Create another connected component 47 | for _ in range(5): 48 | bucket_id = len(buckets) 49 | buckets.append([bucket_id, bucket_id + 1]) 50 | element_to_buckets[bucket_id].add(bucket_id) 51 | element_to_buckets[bucket_id + 1].add(bucket_id) 52 | 53 | res = _find_connected_component(buckets, element_to_buckets) 54 | self.assertEqual(2, len(res)) 55 | self.assertTrue(set(itertools.chain(*buckets)) == set(itertools.chain(*res.values()))) 56 | 57 | 58 | if __name__ == "__main__": 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /tests/test_modify_feature_weights.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from copy import deepcopy 3 | import os 4 | import unittest 5 | from unittest.mock import patch 6 | import tempfile 7 | 8 | import numpy 9 | from sourced.ml.models import OrderedDocumentFrequencies 10 | # from sourced.ml.transformers import BagsBatch 11 | import sourced 12 | 13 | 14 | from apollo.hasher import modify_feature_weights 15 | 16 | 17 | class DummyClass: 18 | pass 19 | 20 | 21 | def dict_to_arguments(d): 22 | res = DummyClass() 23 | 24 | for key in d: 25 | setattr(res, key, d[key]) 26 | 27 | return res 28 | 29 | 30 | @unittest.skip("Skipping test until TODO is done in hasher:177 ") 31 | class FeatureWeightTest(unittest.TestCase): 32 | FakeExtractor = namedtuple("FakeExtractor", ("NAME", "NAMESPACE")) 33 | 34 | def setUp(self): 35 | docs = 1 36 | freq = 1 37 | default_weight = 1 38 | docfreqs = [] 39 | self.extractors = {} 40 | self.extractor_args = {} 41 | for i in range(2): 42 | namespace = "extractor%s." 
% i 43 | feat_freq = {} 44 | for j in range(2): 45 | feat_freq[namespace + str(j)] = freq 46 | docfreqs.append(feat_freq) 47 | 48 | self.extractors[namespace] = self.FakeExtractor(NAME=namespace, NAMESPACE=namespace) 49 | self.extractor_args["%s_weight" % namespace] = default_weight 50 | 51 | # Create tmp file and save OrderedDocumentFrequencies there 52 | self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False) 53 | model = OrderedDocumentFrequencies().construct(docs, docfreqs) 54 | model.save(self.tmp_file.name) 55 | 56 | # arguments.docfreq 57 | self.docfreq_args = {"docfreq": self.tmp_file.name} 58 | 59 | # batches 60 | self.batches = [] # [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))] 61 | 62 | def tearDown(self): 63 | self.tmp_file.close() 64 | try: 65 | os.remove(self.tmp_file.name) 66 | except OSError: 67 | pass 68 | 69 | def test_empty_extractors(self): 70 | arguments = dict_to_arguments(self.docfreq_args) 71 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 72 | result = modify_feature_weights(deepcopy(self.batches), arguments) 73 | self.assertEqual(len(result), len(self.batches)) 74 | for bathc_res, batch_init in zip(result, self.batches): 75 | bathc_res.matrix.sort_indices() 76 | batch_init.matrix.sort_indices() 77 | 78 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 79 | batch_init.matrix.indices)) 80 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, batch_init.matrix.data)) 81 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 82 | batch_init.matrix.indptr)) 83 | 84 | def test_extractor_weight_1(self): 85 | self.docfreq_args.update(self.extractor_args) 86 | arguments = dict_to_arguments(self.docfreq_args) 87 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 88 | result = modify_feature_weights(deepcopy(self.batches), arguments) 89 | self.assertEqual(len(result), len(self.batches)) 90 | for bathc_res, batch_init in zip(result, self.batches): 91 | bathc_res.matrix.sort_indices() 92 | batch_init.matrix.sort_indices() 93 | 94 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 95 | batch_init.matrix.indices)) 96 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, batch_init.matrix.data)) 97 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 98 | batch_init.matrix.indptr)) 99 | 100 | def test_empty_batches(self): 101 | self.docfreq_args.update(self.extractor_args) 102 | arguments = dict_to_arguments(self.docfreq_args) 103 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 104 | result = modify_feature_weights([], arguments) 105 | self.assertEqual(len(result), 0) 106 | 107 | def test_no_docfreq(self): 108 | no_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False) 109 | no_file.close() 110 | try: 111 | os.remove(no_file.name) 112 | except OSError: 113 | pass 114 | 115 | no_docfreq = {"docfreq": no_file.name} 116 | no_docfreq.update(self.extractor_args) 117 | arguments = dict_to_arguments(self.docfreq_args) 118 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 119 | self.assertRaises(Exception, modify_feature_weights(self.batches, arguments)) 120 | 121 | def test_normal_run(self): 122 | self.docfreq_args.update(self.extractor_args) 123 | weight = 2 124 | for key in self.docfreq_args: 125 | if "_weight" in key: 126 | self.docfreq_args[key] *= weight # make not 1 127 | arguments = dict_to_arguments(self.docfreq_args) 128 | with 
patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 129 | result = modify_feature_weights(deepcopy(self.batches), arguments) 130 | self.assertEqual(len(result), len(self.batches)) 131 | for bathc_res, batch_init in zip(result, self.batches): 132 | bathc_res.matrix.sort_indices() 133 | batch_init.matrix.sort_indices() 134 | 135 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 136 | batch_init.matrix.indices)) 137 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, 138 | batch_init.matrix.data * weight)) 139 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 140 | batch_init.matrix.indptr)) 141 | pass 142 | 143 | 144 | if __name__ == "__main__": 145 | unittest.main() 146 | --------------------------------------------------------------------------------