├── .flake8 ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── Dockerfile ├── LICENSE.md ├── MAINTAINERS ├── README.md ├── apollo ├── __init__.py ├── __main__.py ├── bags.py ├── cassandra_utils.py ├── graph.py ├── hasher.py ├── query.md.jinja2 ├── query.py ├── report.md.jinja2 └── warmup.py ├── doc ├── 101.md ├── GLOSSARY.md ├── README.md ├── SUMMARY.md ├── algorithm.md ├── cmd │ ├── bags.md │ ├── cc.md │ ├── cmd.md │ ├── db.md │ ├── dumpcc.md │ ├── dumpcmd.md │ ├── evalcc.md │ ├── features.md │ ├── hash.md │ ├── preprocess.md │ ├── query.md │ └── resetdb.md ├── gemini.md ├── install │ ├── README.md │ ├── db.md │ ├── docker.md │ └── pip.md └── model │ ├── cc.md │ ├── cmd.md │ └── wmh.md ├── docker-compose.yml ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── test_graph_CommunityDetector.py ├── test_graph_ConnectedComponents.py └── test_modify_feature_weights.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=99 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _book 2 | bundle 3 | *.asdf 4 | 5 | #Mac OS 6 | *.DS_Store 7 | 8 | #PyCharm IDE 9 | .idea/ 10 | 11 | # Documentation build files 12 | doc/_build/ 13 | doc/ast2vec.rst 14 | doc/modules.rst 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # dotenv 98 | .env 99 | 100 | # virtualenv 101 | .venv 102 | venv/ 103 | ENV/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | services: 4 | - docker 5 | cache: 6 | directories: 7 | - "$HOME/.cache/pip" 8 | addons: 9 | apt: 10 | packages: 11 | - libsnappy-dev 12 | _install: &_install 13 | - gimme 1.8 14 | - source ~/.gimme/envs/latest.env 15 | - pip install --upgrade pip 16 | - pip install codecov 17 | - pip install -e . 18 | _coverage: &_coverage 19 | - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine" 20 | matrix: 21 | include: 22 | - python: 3.4 23 | env: *_coverage 24 | install: *_install 25 | - python: 3.5 26 | env: *_coverage 27 | install: *_install 28 | - python: 3.6 29 | env: SCRIPT="flake8 ." 30 | install: pip install flake8 31 | - python: 3.6 32 | env: *_coverage 33 | install: *_install 34 | after_success: 35 | - codecov 36 | fast_finish: true 37 | before_script: 38 | - docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd 39 | - docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver 40 | - docker run --name scylla -p 9042:9042 -d scylladb/scylla --developer-mode=1 41 | script: 42 | - (eval "$SCRIPT") 43 | notifications: 44 | email: false 45 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | education, socio-economic status, nationality, personal appearance, race, 10 | religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at conduct@sourced.tech. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Apollo project is [GPL licensed](LICENSE.md) and accepts 4 | contributions via GitHub pull requests. This document outlines some of the 5 | conventions on development workflow, commit message formatting, contact points, 6 | and other resources to make it easier to get your contribution accepted. 
7 | 
8 | ## Certificate of Origin
9 | 
10 | By contributing to this project you agree to the [Developer Certificate of
11 | Origin (DCO)](DCO). This document was created by the Linux Kernel community and is a
12 | simple statement that you, as a contributor, have the legal right to make the
13 | contribution.
14 | 
15 | In order to show your agreement with the DCO, you should include the following line at the end of the
16 | commit message, using your real name: `Signed-off-by: John Doe <john.doe@example.com>`.
17 | 
18 | This can be done easily with the [`-s`](https://github.com/git/git/blob/b2c150d3aa82f6583b9aadfecc5f8fa1c74aca09/Documentation/git-commit.txt#L154-L161) flag of `git commit`.
19 | 
20 | 
21 | ## Support Channels
22 | 
23 | The official support channels, for both users and contributors, are:
24 | 
25 | - GitHub [issues](https://github.com/src-d/apollo/issues)*
26 | - Slack: #machine-learning room in the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM)
27 | 
28 | *Before opening a new issue or submitting a new pull request, it's helpful to
29 | search the project - it's likely that another user has already reported the
30 | issue you're facing, or it's a known issue that we're already aware of.
31 | 
32 | 
33 | ## How to Contribute
34 | 
35 | Pull Requests (PRs) are the main and exclusive way to contribute to the official Apollo project.
36 | In order for a PR to be accepted it needs to pass a list of requirements:
37 | 
38 | - Code Coverage does not decrease.
39 | - All the tests pass.
40 | - The code is formatted according to [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/).
41 | - If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged.
42 | - If the PR is a new feature, it has to come with a suite of unit tests that test the new functionality.
43 | - In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS).
44 | 
45 | 
46 | ### Format of the commit message
47 | 
48 | The commit summary must start with a capital letter and a verb in the present tense. No dot at the end.
49 | 
50 | ``` 
51 | Add a feature
52 | Remove unused code
53 | Fix a bug
54 | ``` 
55 | 
56 | Every commit's details should describe what was changed, in which context and, if applicable, the GitHub issue it relates to.
57 | 
--------------------------------------------------------------------------------
/DCO:
--------------------------------------------------------------------------------
1 | Developer Certificate of Origin
2 | Version 1.1
3 | 
4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
5 | 1 Letterman Drive
6 | Suite D4700
7 | San Francisco, CA, 94129
8 | 
9 | Everyone is permitted to copy and distribute verbatim copies of this
10 | license document, but changing it is not allowed.
11 | 12 | 13 | Developer's Certificate of Origin 1.1 14 | 15 | By making a contribution to this project, I certify that: 16 | 17 | (a) The contribution was created in whole or in part by me and I 18 | have the right to submit it under the open source license 19 | indicated in the file; or 20 | 21 | (b) The contribution is based upon previous work that, to the best 22 | of my knowledge, is covered under an appropriate open source 23 | license and I have the right under that license to submit that 24 | work with modifications, whether created in whole or in part 25 | by me, under the same open source license (unless I am 26 | permitted to submit under a different license), as indicated 27 | in the file; or 28 | 29 | (c) The contribution was provided directly to me by some other 30 | person who certified (a), (b) or (c) and I have not modified 31 | it. 32 | 33 | (d) I understand and agree that this project and the contribution 34 | are public and that a record of the contribution (including all 35 | personal information I submit with it, including my sign-off) is 36 | maintained indefinitely and may be redistributed consistent with 37 | this project or the open source license(s) involved. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # The underlying base is ubuntu:16.04 2 | FROM nvidia/cuda:8.0-runtime 3 | 4 | # NVIDIA driver version must match the host! 5 | ENV DRIVER_VERSION 384.69 6 | RUN mkdir -p /opt/nvidia && cd /opt/nvidia/ \ 7 | && apt-get update && apt-get install -y wget module-init-tools && apt-get clean && rm -rf /var/lib/apt/lists/* \ 8 | && wget http://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run -O /opt/nvidia/driver.run \ 9 | && chmod +x /opt/nvidia/driver.run \ 10 | && /opt/nvidia/driver.run -s --no-nvidia-modprobe --no-kernel-module --no-nouveau-check --no-distro-scripts --no-opengl-files --no-kernel-module-source \ 11 | && rm -rf /opt/nvidia && apt-get purge -y module-init-tools && apt-get autoremove -y 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y --no-install-suggests --no-install-recommends \ 15 | ca-certificates locales git python3 libpython3.5 python3-dev \ 16 | libgomp1 libxml2 libxml2-dev zlib1g-dev \ 17 | libsnappy1v5 libsnappy-dev libonig2 make gcc g++ curl openjdk-8-jre && \ 18 | curl https://bootstrap.pypa.io/get-pip.py | python3 && \ 19 | pip3 install --no-cache-dir PyStemmer bblfsh py4j==0.10.4 modelforge parquet jinja2 libMHCUDA datasketch cassandra_driver python-igraph numpy humanize pygments && \ 20 | apt-get remove -y python3-dev libxml2-dev libsnappy-dev zlib1g-dev make gcc g++ curl && \ 21 | apt-get remove -y *-doc *-man >/dev/null && \ 22 | apt-get autoremove -y && \ 23 | apt-get clean && \ 24 | rm -rf /var/lib/apt/lists/* && \ 25 | locale-gen en_US.UTF-8 26 | 27 | # sudo mount -o bind ... 
bundle/* 28 | ADD bundle/spark /spark/ 29 | ADD bundle/engine/python /bundle/sourced/engine/ 30 | ADD bundle/ml /bundle/sourced/ml/ 31 | 32 | ADD apollo/ /packages/apollo/apollo/ 33 | ADD setup.py /packages/apollo 34 | 35 | ENV PYTHONPATH /packages:/spark/python 36 | ENV LANG en_US.UTF-8 37 | WORKDIR /packages 38 | 39 | RUN echo '0.5.2' > /bundle/sourced/engine/version.txt && pip3 install -e /bundle/sourced/engine/ 40 | RUN pip3 install -e /bundle/sourced/ml/ 41 | RUN pip3 install --no-deps -e apollo/ && apollo warmup -s 'local[*]' 42 | 43 | ENTRYPOINT ["apollo"] 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ### GNU GENERAL PUBLIC LICENSE 2 | 3 | Version 3, 29 June 2007 4 | 5 | Copyright (C) 2007 Free Software Foundation, Inc. 6 | 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | ### Preamble 12 | 13 | The GNU General Public License is a free, copyleft license for 14 | software and other kinds of works. 15 | 16 | The licenses for most software and other practical works are designed 17 | to take away your freedom to share and change the works. By contrast, 18 | the GNU General Public License is intended to guarantee your freedom 19 | to share and change all versions of a program--to make sure it remains 20 | free software for all its users. We, the Free Software Foundation, use 21 | the GNU General Public License for most of our software; it applies 22 | also to any other work released this way by its authors. You can apply 23 | it to your programs, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | them if you wish), that you receive source code or can get it if you 29 | want it, that you can change the software or use pieces of it in new 30 | free programs, and that you know you can do these things. 31 | 32 | To protect your rights, we need to prevent others from denying you 33 | these rights or asking you to surrender the rights. Therefore, you 34 | have certain responsibilities if you distribute copies of the 35 | software, or if you modify it: responsibilities to respect the freedom 36 | of others. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must pass on to the recipients the same 40 | freedoms that you received. You must make sure that they, too, receive 41 | or can get the source code. And you must show them these terms so they 42 | know their rights. 43 | 44 | Developers that use the GNU GPL protect your rights with two steps: 45 | (1) assert copyright on the software, and (2) offer you this License 46 | giving you legal permission to copy, distribute and/or modify it. 47 | 48 | For the developers' and authors' protection, the GPL clearly explains 49 | that there is no warranty for this free software. For both users' and 50 | authors' sake, the GPL requires that modified versions be marked as 51 | changed, so that their problems will not be attributed erroneously to 52 | authors of previous versions. 53 | 54 | Some devices are designed to deny users access to install or run 55 | modified versions of the software inside them, although the 56 | manufacturer can do so. 
This is fundamentally incompatible with the 57 | aim of protecting users' freedom to change the software. The 58 | systematic pattern of such abuse occurs in the area of products for 59 | individuals to use, which is precisely where it is most unacceptable. 60 | Therefore, we have designed this version of the GPL to prohibit the 61 | practice for those products. If such problems arise substantially in 62 | other domains, we stand ready to extend this provision to those 63 | domains in future versions of the GPL, as needed to protect the 64 | freedom of users. 65 | 66 | Finally, every program is threatened constantly by software patents. 67 | States should not allow patents to restrict development and use of 68 | software on general-purpose computers, but in those that do, we wish 69 | to avoid the special danger that patents applied to a free program 70 | could make it effectively proprietary. To prevent this, the GPL 71 | assures that patents cannot be used to render the program non-free. 72 | 73 | The precise terms and conditions for copying, distribution and 74 | modification follow. 75 | 76 | ### TERMS AND CONDITIONS 77 | 78 | #### 0. Definitions. 79 | 80 | "This License" refers to version 3 of the GNU General Public License. 81 | 82 | "Copyright" also means copyright-like laws that apply to other kinds 83 | of works, such as semiconductor masks. 84 | 85 | "The Program" refers to any copyrightable work licensed under this 86 | License. Each licensee is addressed as "you". "Licensees" and 87 | "recipients" may be individuals or organizations. 88 | 89 | To "modify" a work means to copy from or adapt all or part of the work 90 | in a fashion requiring copyright permission, other than the making of 91 | an exact copy. The resulting work is called a "modified version" of 92 | the earlier work or a work "based on" the earlier work. 93 | 94 | A "covered work" means either the unmodified Program or a work based 95 | on the Program. 96 | 97 | To "propagate" a work means to do anything with it that, without 98 | permission, would make you directly or secondarily liable for 99 | infringement under applicable copyright law, except executing it on a 100 | computer or modifying a private copy. Propagation includes copying, 101 | distribution (with or without modification), making available to the 102 | public, and in some countries other activities as well. 103 | 104 | To "convey" a work means any kind of propagation that enables other 105 | parties to make or receive copies. Mere interaction with a user 106 | through a computer network, with no transfer of a copy, is not 107 | conveying. 108 | 109 | An interactive user interface displays "Appropriate Legal Notices" to 110 | the extent that it includes a convenient and prominently visible 111 | feature that (1) displays an appropriate copyright notice, and (2) 112 | tells the user that there is no warranty for the work (except to the 113 | extent that warranties are provided), that licensees may convey the 114 | work under this License, and how to view a copy of this License. If 115 | the interface presents a list of user commands or options, such as a 116 | menu, a prominent item in the list meets this criterion. 117 | 118 | #### 1. Source Code. 119 | 120 | The "source code" for a work means the preferred form of the work for 121 | making modifications to it. "Object code" means any non-source form of 122 | a work. 
123 | 124 | A "Standard Interface" means an interface that either is an official 125 | standard defined by a recognized standards body, or, in the case of 126 | interfaces specified for a particular programming language, one that 127 | is widely used among developers working in that language. 128 | 129 | The "System Libraries" of an executable work include anything, other 130 | than the work as a whole, that (a) is included in the normal form of 131 | packaging a Major Component, but which is not part of that Major 132 | Component, and (b) serves only to enable use of the work with that 133 | Major Component, or to implement a Standard Interface for which an 134 | implementation is available to the public in source code form. A 135 | "Major Component", in this context, means a major essential component 136 | (kernel, window system, and so on) of the specific operating system 137 | (if any) on which the executable work runs, or a compiler used to 138 | produce the work, or an object code interpreter used to run it. 139 | 140 | The "Corresponding Source" for a work in object code form means all 141 | the source code needed to generate, install, and (for an executable 142 | work) run the object code and to modify the work, including scripts to 143 | control those activities. However, it does not include the work's 144 | System Libraries, or general-purpose tools or generally available free 145 | programs which are used unmodified in performing those activities but 146 | which are not part of the work. For example, Corresponding Source 147 | includes interface definition files associated with source files for 148 | the work, and the source code for shared libraries and dynamically 149 | linked subprograms that the work is specifically designed to require, 150 | such as by intimate data communication or control flow between those 151 | subprograms and other parts of the work. 152 | 153 | The Corresponding Source need not include anything that users can 154 | regenerate automatically from other parts of the Corresponding Source. 155 | 156 | The Corresponding Source for a work in source code form is that same 157 | work. 158 | 159 | #### 2. Basic Permissions. 160 | 161 | All rights granted under this License are granted for the term of 162 | copyright on the Program, and are irrevocable provided the stated 163 | conditions are met. This License explicitly affirms your unlimited 164 | permission to run the unmodified Program. The output from running a 165 | covered work is covered by this License only if the output, given its 166 | content, constitutes a covered work. This License acknowledges your 167 | rights of fair use or other equivalent, as provided by copyright law. 168 | 169 | You may make, run and propagate covered works that you do not convey, 170 | without conditions so long as your license otherwise remains in force. 171 | You may convey covered works to others for the sole purpose of having 172 | them make modifications exclusively for you, or provide you with 173 | facilities for running those works, provided that you comply with the 174 | terms of this License in conveying all material for which you do not 175 | control copyright. Those thus making or running the covered works for 176 | you must do so exclusively on your behalf, under your direction and 177 | control, on terms that prohibit them from making any copies of your 178 | copyrighted material outside their relationship with you. 
179 | 180 | Conveying under any other circumstances is permitted solely under the 181 | conditions stated below. Sublicensing is not allowed; section 10 makes 182 | it unnecessary. 183 | 184 | #### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 185 | 186 | No covered work shall be deemed part of an effective technological 187 | measure under any applicable law fulfilling obligations under article 188 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 189 | similar laws prohibiting or restricting circumvention of such 190 | measures. 191 | 192 | When you convey a covered work, you waive any legal power to forbid 193 | circumvention of technological measures to the extent such 194 | circumvention is effected by exercising rights under this License with 195 | respect to the covered work, and you disclaim any intention to limit 196 | operation or modification of the work as a means of enforcing, against 197 | the work's users, your or third parties' legal rights to forbid 198 | circumvention of technological measures. 199 | 200 | #### 4. Conveying Verbatim Copies. 201 | 202 | You may convey verbatim copies of the Program's source code as you 203 | receive it, in any medium, provided that you conspicuously and 204 | appropriately publish on each copy an appropriate copyright notice; 205 | keep intact all notices stating that this License and any 206 | non-permissive terms added in accord with section 7 apply to the code; 207 | keep intact all notices of the absence of any warranty; and give all 208 | recipients a copy of this License along with the Program. 209 | 210 | You may charge any price or no price for each copy that you convey, 211 | and you may offer support or warranty protection for a fee. 212 | 213 | #### 5. Conveying Modified Source Versions. 214 | 215 | You may convey a work based on the Program, or the modifications to 216 | produce it from the Program, in the form of source code under the 217 | terms of section 4, provided that you also meet all of these 218 | conditions: 219 | 220 | - a) The work must carry prominent notices stating that you modified 221 | it, and giving a relevant date. 222 | - b) The work must carry prominent notices stating that it is 223 | released under this License and any conditions added under 224 | section 7. This requirement modifies the requirement in section 4 225 | to "keep intact all notices". 226 | - c) You must license the entire work, as a whole, under this 227 | License to anyone who comes into possession of a copy. This 228 | License will therefore apply, along with any applicable section 7 229 | additional terms, to the whole of the work, and all its parts, 230 | regardless of how they are packaged. This License gives no 231 | permission to license the work in any other way, but it does not 232 | invalidate such permission if you have separately received it. 233 | - d) If the work has interactive user interfaces, each must display 234 | Appropriate Legal Notices; however, if the Program has interactive 235 | interfaces that do not display Appropriate Legal Notices, your 236 | work need not make them do so. 
237 | 238 | A compilation of a covered work with other separate and independent 239 | works, which are not by their nature extensions of the covered work, 240 | and which are not combined with it such as to form a larger program, 241 | in or on a volume of a storage or distribution medium, is called an 242 | "aggregate" if the compilation and its resulting copyright are not 243 | used to limit the access or legal rights of the compilation's users 244 | beyond what the individual works permit. Inclusion of a covered work 245 | in an aggregate does not cause this License to apply to the other 246 | parts of the aggregate. 247 | 248 | #### 6. Conveying Non-Source Forms. 249 | 250 | You may convey a covered work in object code form under the terms of 251 | sections 4 and 5, provided that you also convey the machine-readable 252 | Corresponding Source under the terms of this License, in one of these 253 | ways: 254 | 255 | - a) Convey the object code in, or embodied in, a physical product 256 | (including a physical distribution medium), accompanied by the 257 | Corresponding Source fixed on a durable physical medium 258 | customarily used for software interchange. 259 | - b) Convey the object code in, or embodied in, a physical product 260 | (including a physical distribution medium), accompanied by a 261 | written offer, valid for at least three years and valid for as 262 | long as you offer spare parts or customer support for that product 263 | model, to give anyone who possesses the object code either (1) a 264 | copy of the Corresponding Source for all the software in the 265 | product that is covered by this License, on a durable physical 266 | medium customarily used for software interchange, for a price no 267 | more than your reasonable cost of physically performing this 268 | conveying of source, or (2) access to copy the Corresponding 269 | Source from a network server at no charge. 270 | - c) Convey individual copies of the object code with a copy of the 271 | written offer to provide the Corresponding Source. This 272 | alternative is allowed only occasionally and noncommercially, and 273 | only if you received the object code with such an offer, in accord 274 | with subsection 6b. 275 | - d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | - e) Convey the object code using peer-to-peer transmission, 288 | provided you inform other peers where the object code and 289 | Corresponding Source of the work are being offered to the general 290 | public at no charge under subsection 6d. 291 | 292 | A separable portion of the object code, whose source code is excluded 293 | from the Corresponding Source as a System Library, need not be 294 | included in conveying the object code work. 
295 | 296 | A "User Product" is either (1) a "consumer product", which means any 297 | tangible personal property which is normally used for personal, 298 | family, or household purposes, or (2) anything designed or sold for 299 | incorporation into a dwelling. In determining whether a product is a 300 | consumer product, doubtful cases shall be resolved in favor of 301 | coverage. For a particular product received by a particular user, 302 | "normally used" refers to a typical or common use of that class of 303 | product, regardless of the status of the particular user or of the way 304 | in which the particular user actually uses, or expects or is expected 305 | to use, the product. A product is a consumer product regardless of 306 | whether the product has substantial commercial, industrial or 307 | non-consumer uses, unless such uses represent the only significant 308 | mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to 312 | install and execute modified versions of a covered work in that User 313 | Product from a modified version of its Corresponding Source. The 314 | information must suffice to ensure that the continued functioning of 315 | the modified object code is in no case prevented or interfered with 316 | solely because modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or 331 | updates for a work that has been modified or installed by the 332 | recipient, or for the User Product in which it has been modified or 333 | installed. Access to a network may be denied when the modification 334 | itself materially and adversely affects the operation of the network 335 | or violates the rules and protocols for communication across the 336 | network. 337 | 338 | Corresponding Source conveyed, and Installation Information provided, 339 | in accord with this section must be in a format that is publicly 340 | documented (and with an implementation available to the public in 341 | source code form), and must require no special password or key for 342 | unpacking, reading or copying. 343 | 344 | #### 7. Additional Terms. 345 | 346 | "Additional permissions" are terms that supplement the terms of this 347 | License by making exceptions from one or more of its conditions. 348 | Additional permissions that are applicable to the entire Program shall 349 | be treated as though they were included in this License, to the extent 350 | that they are valid under applicable law. 
If additional permissions 351 | apply only to part of the Program, that part may be used separately 352 | under those permissions, but the entire Program remains governed by 353 | this License without regard to the additional permissions. 354 | 355 | When you convey a copy of a covered work, you may at your option 356 | remove any additional permissions from that copy, or from any part of 357 | it. (Additional permissions may be written to require their own 358 | removal in certain cases when you modify the work.) You may place 359 | additional permissions on material, added by you to a covered work, 360 | for which you have or can give appropriate copyright permission. 361 | 362 | Notwithstanding any other provision of this License, for material you 363 | add to a covered work, you may (if authorized by the copyright holders 364 | of that material) supplement the terms of this License with terms: 365 | 366 | - a) Disclaiming warranty or limiting liability differently from the 367 | terms of sections 15 and 16 of this License; or 368 | - b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | - c) Prohibiting misrepresentation of the origin of that material, 372 | or requiring that modified versions of such material be marked in 373 | reasonable ways as different from the original version; or 374 | - d) Limiting the use for publicity purposes of names of licensors 375 | or authors of the material; or 376 | - e) Declining to grant rights under trademark law for use of some 377 | trade names, trademarks, or service marks; or 378 | - f) Requiring indemnification of licensors and authors of that 379 | material by anyone who conveys the material (or modified versions 380 | of it) with contractual assumptions of liability to the recipient, 381 | for any liability that these contractual assumptions directly 382 | impose on those licensors and authors. 383 | 384 | All other non-permissive additional terms are considered "further 385 | restrictions" within the meaning of section 10. If the Program as you 386 | received it, or any part of it, contains a notice stating that it is 387 | governed by this License along with a term that is a further 388 | restriction, you may remove that term. If a license document contains 389 | a further restriction but permits relicensing or conveying under this 390 | License, you may add to a covered work material governed by the terms 391 | of that license document, provided that the further restriction does 392 | not survive such relicensing or conveying. 393 | 394 | If you add terms to a covered work in accord with this section, you 395 | must place, in the relevant source files, a statement of the 396 | additional terms that apply to those files, or a notice indicating 397 | where to find the applicable terms. 398 | 399 | Additional terms, permissive or non-permissive, may be stated in the 400 | form of a separately written license, or stated as exceptions; the 401 | above requirements apply either way. 402 | 403 | #### 8. Termination. 404 | 405 | You may not propagate or modify a covered work except as expressly 406 | provided under this License. Any attempt otherwise to propagate or 407 | modify it is void, and will automatically terminate your rights under 408 | this License (including any patent licenses granted under the third 409 | paragraph of section 11). 
410 | 411 | However, if you cease all violation of this License, then your license 412 | from a particular copyright holder is reinstated (a) provisionally, 413 | unless and until the copyright holder explicitly and finally 414 | terminates your license, and (b) permanently, if the copyright holder 415 | fails to notify you of the violation by some reasonable means prior to 416 | 60 days after the cessation. 417 | 418 | Moreover, your license from a particular copyright holder is 419 | reinstated permanently if the copyright holder notifies you of the 420 | violation by some reasonable means, this is the first time you have 421 | received notice of violation of this License (for any work) from that 422 | copyright holder, and you cure the violation prior to 30 days after 423 | your receipt of the notice. 424 | 425 | Termination of your rights under this section does not terminate the 426 | licenses of parties who have received copies or rights from you under 427 | this License. If your rights have been terminated and not permanently 428 | reinstated, you do not qualify to receive new licenses for the same 429 | material under section 10. 430 | 431 | #### 9. Acceptance Not Required for Having Copies. 432 | 433 | You are not required to accept this License in order to receive or run 434 | a copy of the Program. Ancillary propagation of a covered work 435 | occurring solely as a consequence of using peer-to-peer transmission 436 | to receive a copy likewise does not require acceptance. However, 437 | nothing other than this License grants you permission to propagate or 438 | modify any covered work. These actions infringe copyright if you do 439 | not accept this License. Therefore, by modifying or propagating a 440 | covered work, you indicate your acceptance of this License to do so. 441 | 442 | #### 10. Automatic Licensing of Downstream Recipients. 443 | 444 | Each time you convey a covered work, the recipient automatically 445 | receives a license from the original licensors, to run, modify and 446 | propagate that work, subject to this License. You are not responsible 447 | for enforcing compliance by third parties with this License. 448 | 449 | An "entity transaction" is a transaction transferring control of an 450 | organization, or substantially all assets of one, or subdividing an 451 | organization, or merging organizations. If propagation of a covered 452 | work results from an entity transaction, each party to that 453 | transaction who receives a copy of the work also receives whatever 454 | licenses to the work the party's predecessor in interest had or could 455 | give under the previous paragraph, plus a right to possession of the 456 | Corresponding Source of the work from the predecessor in interest, if 457 | the predecessor has it or can get it with reasonable efforts. 458 | 459 | You may not impose any further restrictions on the exercise of the 460 | rights granted or affirmed under this License. For example, you may 461 | not impose a license fee, royalty, or other charge for exercise of 462 | rights granted under this License, and you may not initiate litigation 463 | (including a cross-claim or counterclaim in a lawsuit) alleging that 464 | any patent claim is infringed by making, using, selling, offering for 465 | sale, or importing the Program or any portion of it. 466 | 467 | #### 11. Patents. 468 | 469 | A "contributor" is a copyright holder who authorizes use under this 470 | License of the Program or a work on which the Program is based. 
The 471 | work thus licensed is called the contributor's "contributor version". 472 | 473 | A contributor's "essential patent claims" are all patent claims owned 474 | or controlled by the contributor, whether already acquired or 475 | hereafter acquired, that would be infringed by some manner, permitted 476 | by this License, of making, using, or selling its contributor version, 477 | but do not include claims that would be infringed only as a 478 | consequence of further modification of the contributor version. For 479 | purposes of this definition, "control" includes the right to grant 480 | patent sublicenses in a manner consistent with the requirements of 481 | this License. 482 | 483 | Each contributor grants you a non-exclusive, worldwide, royalty-free 484 | patent license under the contributor's essential patent claims, to 485 | make, use, sell, offer for sale, import and otherwise run, modify and 486 | propagate the contents of its contributor version. 487 | 488 | In the following three paragraphs, a "patent license" is any express 489 | agreement or commitment, however denominated, not to enforce a patent 490 | (such as an express permission to practice a patent or covenant not to 491 | sue for patent infringement). To "grant" such a patent license to a 492 | party means to make such an agreement or commitment not to enforce a 493 | patent against the party. 494 | 495 | If you convey a covered work, knowingly relying on a patent license, 496 | and the Corresponding Source of the work is not available for anyone 497 | to copy, free of charge and under the terms of this License, through a 498 | publicly available network server or other readily accessible means, 499 | then you must either (1) cause the Corresponding Source to be so 500 | available, or (2) arrange to deprive yourself of the benefit of the 501 | patent license for this particular work, or (3) arrange, in a manner 502 | consistent with the requirements of this License, to extend the patent 503 | license to downstream recipients. "Knowingly relying" means you have 504 | actual knowledge that, but for the patent license, your conveying the 505 | covered work in a country, or your recipient's use of the covered work 506 | in a country, would infringe one or more identifiable patents in that 507 | country that you have reason to believe are valid. 508 | 509 | If, pursuant to or in connection with a single transaction or 510 | arrangement, you convey, or propagate by procuring conveyance of, a 511 | covered work, and grant a patent license to some of the parties 512 | receiving the covered work authorizing them to use, propagate, modify 513 | or convey a specific copy of the covered work, then the patent license 514 | you grant is automatically extended to all recipients of the covered 515 | work and works based on it. 516 | 517 | A patent license is "discriminatory" if it does not include within the 518 | scope of its coverage, prohibits the exercise of, or is conditioned on 519 | the non-exercise of one or more of the rights that are specifically 520 | granted under this License. 
You may not convey a covered work if you 521 | are a party to an arrangement with a third party that is in the 522 | business of distributing software, under which you make payment to the 523 | third party based on the extent of your activity of conveying the 524 | work, and under which the third party grants, to any of the parties 525 | who would receive the covered work from you, a discriminatory patent 526 | license (a) in connection with copies of the covered work conveyed by 527 | you (or copies made from those copies), or (b) primarily for and in 528 | connection with specific products or compilations that contain the 529 | covered work, unless you entered into that arrangement, or that patent 530 | license was granted, prior to 28 March 2007. 531 | 532 | Nothing in this License shall be construed as excluding or limiting 533 | any implied license or other defenses to infringement that may 534 | otherwise be available to you under applicable patent law. 535 | 536 | #### 12. No Surrender of Others' Freedom. 537 | 538 | If conditions are imposed on you (whether by court order, agreement or 539 | otherwise) that contradict the conditions of this License, they do not 540 | excuse you from the conditions of this License. If you cannot convey a 541 | covered work so as to satisfy simultaneously your obligations under 542 | this License and any other pertinent obligations, then as a 543 | consequence you may not convey it at all. For example, if you agree to 544 | terms that obligate you to collect a royalty for further conveying 545 | from those to whom you convey the Program, the only way you could 546 | satisfy both those terms and this License would be to refrain entirely 547 | from conveying the Program. 548 | 549 | #### 13. Use with the GNU Affero General Public License. 550 | 551 | Notwithstanding any other provision of this License, you have 552 | permission to link or combine any covered work with a work licensed 553 | under version 3 of the GNU Affero General Public License into a single 554 | combined work, and to convey the resulting work. The terms of this 555 | License will continue to apply to the part which is the covered work, 556 | but the special requirements of the GNU Affero General Public License, 557 | section 13, concerning interaction through a network will apply to the 558 | combination as such. 559 | 560 | #### 14. Revised Versions of this License. 561 | 562 | The Free Software Foundation may publish revised and/or new versions 563 | of the GNU General Public License from time to time. Such new versions 564 | will be similar in spirit to the present version, but may differ in 565 | detail to address new problems or concerns. 566 | 567 | Each version is given a distinguishing version number. If the Program 568 | specifies that a certain numbered version of the GNU General Public 569 | License "or any later version" applies to it, you have the option of 570 | following the terms and conditions either of that numbered version or 571 | of any later version published by the Free Software Foundation. If the 572 | Program does not specify a version number of the GNU General Public 573 | License, you may choose any version ever published by the Free 574 | Software Foundation. 575 | 576 | If the Program specifies that a proxy can decide which future versions 577 | of the GNU General Public License can be used, that proxy's public 578 | statement of acceptance of a version permanently authorizes you to 579 | choose that version for the Program. 
580 | 581 | Later license versions may give you additional or different 582 | permissions. However, no additional obligations are imposed on any 583 | author or copyright holder as a result of your choosing to follow a 584 | later version. 585 | 586 | #### 15. Disclaimer of Warranty. 587 | 588 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 589 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 590 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT 591 | WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT 592 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 593 | A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND 594 | PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE 595 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR 596 | CORRECTION. 597 | 598 | #### 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR 602 | CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 603 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 604 | ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT 605 | NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR 606 | LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM 607 | TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER 608 | PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 609 | 610 | #### 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | ### How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these 626 | terms. 627 | 628 | To do so, attach the following notices to the program. It is safest to 629 | attach them to the start of each source file to most effectively state 630 | the exclusion of warranty; and each file should have at least the 631 | "copyright" line and a pointer to where the full notice is found. 632 | 633 | 634 | Copyright (C) 635 | 636 | This program is free software: you can redistribute it and/or modify 637 | it under the terms of the GNU General Public License as published by 638 | the Free Software Foundation, either version 3 of the License, or 639 | (at your option) any later version. 640 | 641 | This program is distributed in the hope that it will be useful, 642 | but WITHOUT ANY WARRANTY; without even the implied warranty of 643 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 644 | GNU General Public License for more details. 645 | 646 | You should have received a copy of the GNU General Public License 647 | along with this program. If not, see . 648 | 649 | Also add information on how to contact you by electronic and paper 650 | mail. 
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands \`show w' and \`show c' should show the
661 | appropriate parts of the General Public License. Of course, your
662 | program's commands might be different; for a GUI interface, you would
663 | use an "about box".
664 | 
665 | You should also get your employer (if you work as a programmer) or
666 | school, if any, to sign a "copyright disclaimer" for the program, if
667 | necessary. For more information on this, and how to apply and follow
668 | the GNU GPL, see <https://www.gnu.org/licenses/>.
669 | 
670 | The GNU General Public License does not permit incorporating your
671 | program into proprietary programs. If your program is a subroutine
672 | library, you may consider it more useful to permit linking proprietary
673 | applications with the library. If this is what you want to do, use the
674 | GNU Lesser General Public License instead of this License. But first,
675 | please read <https://www.gnu.org/philosophy/why-not-lgpl.html>.
--------------------------------------------------------------------------------
/MAINTAINERS:
--------------------------------------------------------------------------------
1 | Vadim Markovtsev (@vmarkovtsev)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Apollo
2 | ======
3 | 
4 | Advanced code deduplicator. Powered by [source\{d\} ML](https://github.com/src-d/ml),
5 | [source\{d\} engine](https://github.com/src-d/engine) and [minhashcuda](https://github.com/src-d/minhashcuda).
6 | Agnostic to the analysed language thanks to [Babelfish](https://doc.bblf.sh). Python 3, PySpark, CUDA inside.
7 | 
8 | ### What is this?
9 | 
10 | source{d}'s effort to research and solve the code deduplication problem. At scale, as usual.
11 | A [code clone](https://en.wikipedia.org/wiki/Duplicate_code) is a set of code snippets with few differences.
12 | For now this project focuses on finding near-duplicate projects and files; support for
13 | functions and snippets will come later.
14 | 
15 | ### Should I use it?
16 | 
17 | If you've got hundreds of thousands of files or more, consider it. Otherwise, use one of the many
18 | existing tools which may already be integrated into your IDE.
19 | 
20 | ### Difference from [src-d/gemini](https://github.com/src-d/gemini)?
21 | 
22 | This guy is my brother. Apollo focuses on research, extensibility, flexibility and rapid
23 | changes, while Gemini focuses on performance and serious production usage. All the proven and
24 | tested features will eventually be ported to Gemini. At the same time, Gemini may reuse some
25 | of Apollo's code.
26 | 
27 | ### Algorithm
28 | 
29 | Apollo takes the "hash'em all" approach. We extract unordered weighted features from code aka "weighted bags",
30 | apply [Weighted MinHash](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf)
31 | and then build the [Locality Sensitive Hashing index](http://infolab.stanford.edu/~ullman/mmds/ch3.pdf).
32 | All items which appear in the same hashtable bucket are considered the same. The size of the hash
33 | and the number of hashtables depend on the [weighted Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index#Generalized_Jaccard_similarity_and_distance)
34 | threshold (hence Weighted MinHash). A toy Python sketch of this hashing and bucketing step is included below, after the License section.
35 | 
36 | The features include identifiers such as variable, function or class names, literal values and *structural elements*.
37 | The latter carry the topological information, and we currently support several variants: "node2vec",
38 | "deterministic node2vec" and "role-children atoms". Graphlets are upcoming. Different features
39 | have different weights which will be tuned by a hyperparameter optimization algorithm or even an SGD
40 | (not yet implemented).
41 | 
42 | That's not all, unfortunately: dumping the huge graph of pairwise similarities is of little practical use.
43 | We need to group (cluster) the neighborhoods of densely connected nodes. Apollo solves this problem
44 | in two steps:
45 | 
46 | 1. Run [connected components](https://en.wikipedia.org/wiki/Connected_component_(graph_theory))
47 | analysis to find disjoint parts in the similarity graph.
48 | 2. Run [community detection](https://en.wikipedia.org/wiki/Community_structure) to cluster the components.
49 | The resulting clusters may overlap.
50 | 
51 | ### Implementation
52 | 
53 | Apollo is structured as a series of CLI commands. It stores data in [Cassandra](http://cassandra.apache.org/)
54 | (compatible with [Scylla](http://www.scylladb.com/)) and
55 | writes MinHashCuda batches on disk. Community detection is delegated to [igraph](http://igraph.org/python/).
56 | 
57 | * `resetdb` destructively (re)initializes a Cassandra keyspace.
58 | * `bags` extracts the features, stores them in the database and writes MinHashCuda batches on disk.
59 | Runs source{d} engine through PySpark.
60 | * `hash` performs the hashing, writes the hashtables to the database and hashing parameters on disk
61 | in [Modelforge](https://github.com/src-d/modelforge) format.
62 | * `cc` fetches the buckets, runs the connected component analysis and writes the result on disk in Modelforge
63 | format. Uses PySpark.
64 | * `cmd` reads the connected components and performs the community detection (by default, walktrap).
65 | Uses PySpark.
66 | * `query` outputs items similar to the specified one. In the case of files, either the path or the sha1 is accepted.
67 | * `dumpcmd` outputs the groups of similar items.
68 | 
69 | ### Installation
70 | 
71 | ``` 
72 | mount -o bind /path/to/sourced-ml bundle/ml
73 | mount -o bind /path/to/spark-2.2.0-bin-hadoop2.7 bundle/spark
74 | mount -o bind /path/to/sourced-engine bundle/engine
75 | docker build -t srcd/apollo .
76 | docker run --name scylla -p 9042:9042 -v /var/lib/scylla:/var/lib/scylla -d scylladb/scylla --developer-mode=1
77 | docker run -it --rm --link scylla srcd/apollo resetdb --cassandra scylla
78 | docker run -d --name bblfshd --privileged -p 9432:9432 -v /var/lib/bblfshd:/var/lib/bblfshd bblfsh/bblfshd
79 | docker exec -it bblfshd bblfshctl driver install --all
80 | ``` 
81 | 
82 | You are going to need [grip](https://github.com/joeyespo/grip) to instantly render Markdown reports
83 | in your browser. There are multiple Docker options available, e.g.
84 | [1](https://github.com/psycofdj/docker-grip), [2](https://github.com/fstab/docker-grip),
85 | [3](https://github.com/kba/grip-docker).
86 | 
87 | ### Contributions
88 | 
89 | ...are welcome! See [CONTRIBUTING](CONTRIBUTING.md) and [code of conduct](CODE_OF_CONDUCT.md).
90 | 
91 | ### License
92 | 
93 | [GPL](LICENSE.md).
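### Toy hashing sketch

The snippet below is a minimal, self-contained illustration of the hashing and bucketing step described
in the Algorithm section, written with [datasketch](https://github.com/ekzhu/datasketch), which the Docker
image already installs. The vocabulary, the weights and the "16 hashtables of 8 rows" layout are invented
for the example and are not Apollo's defaults: the real pipeline hashes the weighted bags with MinHashCuda,
derives the hashtable layout from the requested Jaccard similarity threshold and stores the buckets in Cassandra.

```
# Toy pipeline: weighted bags -> Weighted MinHash -> LSH buckets.
from collections import defaultdict

import numpy as np
from datasketch import WeightedMinHashGenerator

vocabulary = ["id.foo", "id.bar", "lit.42", "uast2seq.a>b>c"]  # invented feature names
bags = {  # file -> feature weights over the shared vocabulary
    "file1.py": np.array([3.0, 1.0, 0.1, 2.0]),
    "file2.py": np.array([3.0, 1.0, 0.5, 2.0]),  # near-duplicate of file1.py
    "file3.py": np.array([0.1, 0.1, 5.0, 0.1]),
}

gen = WeightedMinHashGenerator(len(vocabulary), sample_size=128, seed=7)
hashes = {name: gen.minhash(weights) for name, weights in bags.items()}

# LSH banding: split the 128 samples into 16 hashtables of 8 rows each.
# Items sharing a bucket in any hashtable become candidate duplicates.
bands, rows = 16, 8
buckets = defaultdict(set)
for name, wmh in hashes.items():
    for band in range(bands):
        band_rows = wmh.hashvalues[band * rows:(band + 1) * rows]
        buckets[(band, tuple(map(tuple, band_rows)))].add(name)

candidates = {frozenset(b) for b in buckets.values() if len(b) > 1}
print(candidates)  # almost surely {frozenset({'file1.py', 'file2.py'})}
print(hashes["file1.py"].jaccard(hashes["file2.py"]))  # estimated weighted Jaccard, ~0.94
```

Pairs that never share a bucket are never compared, which is what keeps the approach scalable; the `cc`
and `cmd` commands then turn the candidate buckets into connected components and overlapping communities.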
94 | 95 | ## Docker command snippets 96 | 97 | ### Bags 98 | 99 | ``` 100 | docker run -it --rm -v /path/to/io:/io --link bblfshd --link scylla srcd/apollo bags -r /io/siva \ 101 | --bow /io/bags/bow.asdf --docfreq /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \ 102 | -l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --cassandra scylla --persist MEMORY_ONLY \ 103 | --config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G 104 | ``` 105 | 106 | ### Hash 107 | 108 | ``` 109 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo hash /io/batches/bow*.asdf -p /io/bags/params.asdf \ 110 | -t 0.8 --cassandra scylla 111 | ``` 112 | 113 | ### Query sha1 114 | 115 | ``` 116 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo query -i --precise \ 117 | --docfreq /io/bags/docfreq.asdf -t 0.8 --cassandra scylla 118 | ``` 119 | 120 | ### Query file 121 | 122 | ``` 123 | docker run -it --rm -v /path/to/io:/io -v .:/q --link bblfshd --link scylla srcd/apollo query \ 124 | -f /q/myfile.java --bblfsh bblfshd --cassandra scylla --precise --docfreq /io/docfreq.asdf \ 125 | --params /io/params.asdf -t 0.9 | grip -b - 126 | ``` 127 | 128 | ### Connected components 129 | 130 | ``` 131 | docker run -it --rm -v /path/to/io:/io --link scylla srcd/apollo cc -o /io/ccs.asdf 132 | ``` 133 | 134 | ### Dump connected components 135 | 136 | ``` 137 | docker run -it --rm -v /path/to/io:/io srcd/apollo dumpcc -o /io/ccs.asdf 138 | ``` 139 | 140 | ### Community detection 141 | 142 | ``` 143 | docker run -it --rm -v /path/to/io:/io srcd/apollo cmd -i /io/ccs.asdf -o /io/communities.asdf -s 'local[*]' 144 | ``` 145 | 146 | ### Dump communities (final report) 147 | 148 | ``` 149 | docker run -it --rm -v /path/to/io:/io srcd/apollo dumpcmd /io/communities.asdf | grip -b - 150 | ``` 151 | -------------------------------------------------------------------------------- /apollo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/apollo/fcdf67bb579681bbf978168e909cd74207ed06db/apollo/__init__.py -------------------------------------------------------------------------------- /apollo/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from time import time 6 | 7 | from igraph import Graph 8 | from modelforge.logs import setup_logging 9 | from sourced.ml import extractors 10 | from sourced.ml.utils import add_engine_args, add_spark_args 11 | from sourced.ml.cmd import ArgumentDefaultsHelpFormatterNoNone 12 | from sourced.ml.cmd.args import add_bow_args, add_feature_args, add_repo2_args, \ 13 | add_df_args, add_repartitioner_arg 14 | 15 | from apollo.bags import preprocess, source2bags 16 | from apollo.cassandra_utils import reset_db 17 | from apollo.graph import find_connected_components, dumpcc, detect_communities, dumpcmd, \ 18 | evaluate_communities 19 | from apollo.hasher import hash_batches 20 | from apollo.query import query 21 | from apollo.warmup import warmup 22 | 23 | 24 | CASSANDRA_PACKAGE = "com.datastax.spark:spark-cassandra-connector_2.11:2.0.3" 25 | 26 | 27 | def get_parser() -> argparse.ArgumentParser: 28 | """ 29 | Create the cmdline argument parser. 
30 | """ 31 | parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone) 32 | parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel, 33 | help="Logging verbosity.") 34 | 35 | def add_feature_weight_arg(my_parser): 36 | help_desc = "%s's weight - all features from this extractor will be multiplied by this " \ 37 | "factor" 38 | for ex in extractors.__extractors__.values(): 39 | my_parser.add_argument("--%s-weight" % ex.NAME, default=1, type=float, 40 | help=help_desc % ex.__name__) 41 | 42 | def add_cassandra_args(my_parser): 43 | my_parser.add_argument( 44 | "--cassandra", default="0.0.0.0:9042", help="Cassandra's host:port.") 45 | my_parser.add_argument("--keyspace", default="apollo", 46 | help="Cassandra's key space.") 47 | my_parser.add_argument( 48 | "--tables", help="Table name mapping (JSON): bags, hashes, hashtables, hashtables2.") 49 | 50 | def add_wmh_args(my_parser, params_help: str, add_hash_size: bool, required: bool): 51 | if add_hash_size: 52 | my_parser.add_argument("--size", type=int, default=128, help="Hash size.") 53 | my_parser.add_argument("-p", "--params", required=required, help=params_help) 54 | my_parser.add_argument("-t", "--threshold", required=required, type=float, 55 | help="Jaccard similarity threshold.") 56 | my_parser.add_argument("--false-positive-weight", type=float, default=0.5, 57 | help="Used to adjust the relative importance of " 58 | "minimizing false positives count when optimizing " 59 | "for the Jaccard similarity threshold.") 60 | my_parser.add_argument("--false-negative-weight", type=float, default=0.5, 61 | help="Used to adjust the relative importance of " 62 | "minimizing false negatives count when optimizing " 63 | "for the Jaccard similarity threshold.") 64 | 65 | def add_template_args(my_parser, default_template): 66 | my_parser.add_argument("--batch", type=int, default=100, 67 | help="Number of hashes to query at a time.") 68 | my_parser.add_argument("--template", default=default_template, 69 | help="Jinja2 template to render.") 70 | 71 | # Create and construct subparsers 72 | subparsers = parser.add_subparsers(help="Commands", dest="command") 73 | 74 | # ------------------------------------------------------------------------ 75 | warmup_parser = subparsers.add_parser( 76 | "warmup", help="Initialize source{d} engine.") 77 | warmup_parser.set_defaults(handler=warmup) 78 | add_engine_args(warmup_parser, default_packages=[CASSANDRA_PACKAGE]) 79 | 80 | # ------------------------------------------------------------------------ 81 | db_parser = subparsers.add_parser("resetdb", help="Destructively initialize the database.") 82 | db_parser.set_defaults(handler=reset_db) 83 | add_cassandra_args(db_parser) 84 | db_parser.add_argument( 85 | "--hashes-only", action="store_true", 86 | help="Only clear the tables: hashes, hashtables, hashtables2. 
Do not touch the rest.") 87 | # ------------------------------------------------------------------------ 88 | preprocess_parser = subparsers.add_parser( 89 | "preprocess", help="Creates the index, quant and docfreq model of the bag-of-words model.") 90 | preprocess_parser.set_defaults(handler=preprocess) 91 | add_df_args(preprocess_parser) 92 | add_repo2_args(preprocess_parser) 93 | add_feature_args(preprocess_parser) 94 | add_repartitioner_arg(preprocess_parser) 95 | preprocess_parser.add_argument( 96 | "--cached-index-path", default=None, 97 | help="[OUT] Path to the docfreq model holding the document's index.") 98 | # ------------------------------------------------------------------------ 99 | source2bags_parser = subparsers.add_parser( 100 | "bags", help="Convert source code to weighted sets.") 101 | source2bags_parser.set_defaults(handler=source2bags) 102 | add_bow_args(source2bags_parser) 103 | add_repo2_args(source2bags_parser, default_packages=[CASSANDRA_PACKAGE]) 104 | add_feature_args(source2bags_parser) 105 | add_cassandra_args(source2bags_parser) 106 | add_df_args(source2bags_parser) 107 | add_repartitioner_arg(source2bags_parser) 108 | source2bags_parser.add_argument( 109 | "--cached-index-path", default=None, 110 | help="[IN] Path to the docfreq model holding the document's index.") 111 | 112 | # ------------------------------------------------------------------------ 113 | hash_parser = subparsers.add_parser( 114 | "hash", help="Run MinHashCUDA on the bag batches.") 115 | hash_parser.set_defaults(handler=hash_batches) 116 | hash_parser.add_argument("-i", "--input", 117 | help="Path to the directory with Parquet files.") 118 | hash_parser.add_argument("--seed", type=int, default=int(time()), 119 | help="Random generator's seed.") 120 | hash_parser.add_argument("--mhc-verbosity", type=int, default=1, 121 | help="MinHashCUDA logs verbosity level.") 122 | hash_parser.add_argument("--devices", type=int, default=0, 123 | help="Or-red indices of NVIDIA devices to use. 
0 means all.") 124 | add_wmh_args(hash_parser, "Path to the output file with WMH parameters.", True, True) 125 | add_cassandra_args(hash_parser) 126 | add_spark_args(hash_parser, default_packages=[CASSANDRA_PACKAGE]) 127 | add_feature_weight_arg(hash_parser) 128 | add_repartitioner_arg(hash_parser) 129 | 130 | # ------------------------------------------------------------------------ 131 | query_parser = subparsers.add_parser("query", help="Query for similar files.") 132 | query_parser.set_defaults(handler=query) 133 | mode_group = query_parser.add_mutually_exclusive_group(required=True) 134 | mode_group.add_argument("-i", "--id", help="Query for this id (id mode).") 135 | mode_group.add_argument("-c", "--file", help="Query for this file (file mode).") 136 | query_parser.add_argument("--docfreq", help="Path to OrderedDocumentFrequencies (file mode).") 137 | query_parser.add_argument("--min-docfreq", default=1, type=int, 138 | help="The minimum document frequency of each feature.") 139 | query_parser.add_argument( 140 | "--bblfsh", default="localhost:9432", help="Babelfish server's address.") 141 | query_parser.add_argument("--precise", action="store_true", 142 | help="Calculate the precise set.") 143 | add_wmh_args(query_parser, "Path to the Weighted MinHash parameters.", False, False) 144 | add_feature_args(query_parser, required=False) 145 | add_template_args(query_parser, "query.md.jinja2") 146 | add_cassandra_args(query_parser) 147 | 148 | # ------------------------------------------------------------------------ 149 | cc_parser = subparsers.add_parser( 150 | "cc", help="Load the similar pairs of files and run connected components analysis.") 151 | cc_parser.set_defaults(handler=find_connected_components) 152 | add_cassandra_args(cc_parser) 153 | cc_parser.add_argument("-o", "--output", required=True, 154 | help="[OUT] Path to connected components ASDF model.") 155 | 156 | # ------------------------------------------------------------------------ 157 | dumpcc_parser = subparsers.add_parser( 158 | "dumpcc", help="Output the connected components to stdout.") 159 | dumpcc_parser.set_defaults(handler=dumpcc) 160 | dumpcc_parser.add_argument("-i", "--input", required=True, 161 | help="Path to connected components ASDF model.") 162 | # ------------------------------------------------------------------------ 163 | community_parser = subparsers.add_parser( 164 | "cmd", help="Run Community Detection analysis on the connected components from \"cc\".") 165 | community_parser.set_defaults(handler=detect_communities) 166 | community_parser.add_argument("-i", "--input", required=True, 167 | help="Path to connected components ASDF model.") 168 | community_parser.add_argument("-o", "--output", required=True, 169 | help="[OUT] Path to the communities ASDF model.") 170 | community_parser.add_argument("--edges", choices=("linear", "quadratic", "1", "2"), 171 | default="linear", 172 | help="The method to generate the graph's edges: bipartite - " 173 | "linear and fast, but may not fit some the CD algorithms, " 174 | "or all to all within a bucket - quadratic and slow, but " 175 | "surely fits all the algorithms.") 176 | cmd_choices = [k[10:] for k in dir(Graph) if k.startswith("community_")] 177 | community_parser.add_argument("-a", "--algorithm", choices=cmd_choices, 178 | default="walktrap", 179 | help="The community detection algorithm to apply.") 180 | community_parser.add_argument("-p", "--params", type=json.loads, default={}, 181 | help="Parameters for the algorithm (**kwargs, JSON format).") 182 | 
community_parser.add_argument("--no-spark", action="store_true", help="Do not use Spark.") 183 | add_spark_args(community_parser) 184 | 185 | # ------------------------------------------------------------------------ 186 | dumpcmd_parser = subparsers.add_parser( 187 | "dumpcmd", help="Output the detected communities to stdout.") 188 | dumpcmd_parser.set_defaults(handler=dumpcmd) 189 | dumpcmd_parser.add_argument("input", help="Path to the communities ASDF model.") 190 | add_template_args(dumpcmd_parser, "report.md.jinja2") 191 | add_cassandra_args(dumpcmd_parser) 192 | 193 | # ------------------------------------------------------------------------ 194 | evalcc_parser = subparsers.add_parser( 195 | "evalcc", help="Evaluate the communities: calculate the precise similarity and the " 196 | "fitness metric.") 197 | evalcc_parser.set_defaults(handler=evaluate_communities) 198 | evalcc_parser.add_argument("-t", "--threshold", required=True, type=float, 199 | help="Jaccard similarity threshold.") 200 | evalcc_parser.add_argument("-i", "--input", required=True, 201 | help="Path to the communities model.") 202 | 203 | add_spark_args(evalcc_parser, default_packages=[CASSANDRA_PACKAGE]) 204 | add_cassandra_args(evalcc_parser) 205 | 206 | # TODO: retable [.....] -> [.] [.] [.] [.] [.] 207 | return parser 208 | 209 | 210 | def main(): 211 | """ 212 | Creates all the argument parsers and invokes the function from set_defaults(). 213 | 214 | :return: The result of the function from set_defaults(). 215 | """ 216 | parser = get_parser() 217 | args = parser.parse_args() 218 | args.log_level = logging._nameToLevel[args.log_level] 219 | setup_logging(args.log_level) 220 | try: 221 | handler = args.handler 222 | except AttributeError: 223 | def print_usage(_): 224 | parser.print_usage() 225 | 226 | handler = print_usage 227 | return handler(args) 228 | 229 | 230 | if __name__ == "__main__": 231 | sys.exit(main()) 232 | -------------------------------------------------------------------------------- /apollo/bags.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import Row 2 | from sourced.ml.cmd import repos2bow_template, repos2bow_index_template 3 | from sourced.ml.transformers import Transformer 4 | 5 | from apollo import cassandra_utils 6 | 7 | 8 | class BagsSaver(Transformer): 9 | def __init__(self, keyspace, table, **kwargs): 10 | super().__init__(**kwargs) 11 | self.keyspace = keyspace 12 | self.table = table 13 | 14 | def __call__(self, head): 15 | rows = head.map(lambda row: Row(sha1=row.document, 16 | item=row.token, 17 | value=float(row.value))) 18 | if self.explained: 19 | self._log.info("toDebugString():\n%s", rows.toDebugString().decode()) 20 | rows.toDF() \ 21 | .write \ 22 | .format("org.apache.spark.sql.cassandra") \ 23 | .mode("append") \ 24 | .options(table=self.table, keyspace=self.keyspace) \ 25 | .save() 26 | return head 27 | 28 | 29 | class MetadataSaver(Transformer): 30 | def __init__(self, keyspace, table, **kwargs): 31 | super().__init__(**kwargs) 32 | self.keyspace = keyspace 33 | self.table = table 34 | 35 | def __call__(self, head): 36 | rows = head.map(lambda x: Row( 37 | sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path)) 38 | if self.explained: 39 | self._log.info("toDebugString():\n%s", rows.toDebugString().decode()) 40 | rows.toDF() \ 41 | .write \ 42 | .format("org.apache.spark.sql.cassandra") \ 43 | .mode("append") \ 44 | .options(table=self.table, keyspace=self.keyspace) \ 45 | .save() 46 | 
47 | 48 | def preprocess(args): 49 | return repos2bow_index_template(args) 50 | 51 | 52 | def source2bags(args): 53 | cassandra_utils.configure(args) 54 | return repos2bow_template( 55 | args, 56 | cache_hook=lambda: MetadataSaver(args.keyspace, args.tables["meta"]), 57 | save_hook=lambda: BagsSaver(args.keyspace, args.tables["bags"])) 58 | -------------------------------------------------------------------------------- /apollo/cassandra_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging 3 | import json 4 | import platform 5 | import re 6 | from typing import Iterable 7 | 8 | import modelforge.logs 9 | from cassandra.cluster import Cluster, Session, NoHostAvailable 10 | from cassandra.policies import RoundRobinPolicy 11 | 12 | 13 | def patch_tables(args): 14 | if args.tables and isinstance(args.tables, str): 15 | tables = args.tables 16 | else: 17 | tables = "" 18 | defaults = ("bags", "meta", "hashes", "hashtables", "hashtables2") 19 | args.tables = {n: n for n in defaults} 20 | if tables: 21 | args.tables.update(json.loads(tables)) 22 | 23 | 24 | def configure(args): 25 | try: 26 | cas_host, cas_port = args.cassandra.split(":") 27 | except ValueError: 28 | cas_host = args.cassandra 29 | cas_port = "9042" 30 | args.config.append("spark.cassandra.connection.host=" + cas_host) 31 | args.config.append("spark.cassandra.connection.port=" + cas_port) 32 | patch_tables(args) 33 | return args 34 | 35 | 36 | def get_db(args): 37 | log = logging.getLogger("cassandra") 38 | patch_tables(args) 39 | try: 40 | cas_host, cas_port = args.cassandra.split(":") 41 | except ValueError: 42 | cas_host = args.cassandra 43 | cas_port = "9042" 44 | 45 | def get_cluster(): 46 | return Cluster((cas_host,), port=int(cas_port), 47 | load_balancing_policy=RoundRobinPolicy()) 48 | cluster = get_cluster() 49 | log.info("Connecting to %s", args.cassandra) 50 | try: 51 | session = cluster.connect(args.keyspace) 52 | except NoHostAvailable: 53 | log.warning("Keyspace %s does not exist", args.keyspace) 54 | cluster = get_cluster() 55 | session = cluster.connect() 56 | return session 57 | 58 | 59 | def reset_db(args): 60 | db = get_db(args) 61 | 62 | def cql(cmd): 63 | print(cmd + ";") 64 | db.execute(cmd) 65 | 66 | if not args.hashes_only: 67 | cql("DROP KEYSPACE IF EXISTS %s" % args.keyspace) 68 | cql("CREATE KEYSPACE %s WITH REPLICATION = {" 69 | "'class' : 'SimpleStrategy', 'replication_factor' : 1}" % args.keyspace) 70 | print("USE %s;" % args.keyspace) 71 | db.set_keyspace(args.keyspace) 72 | tables = args.tables 73 | if not args.hashes_only: 74 | cql("CREATE TABLE %s (sha1 ascii, item ascii, value float, PRIMARY KEY (sha1, item))" 75 | % tables["bags"]) 76 | cql("CREATE TABLE %s (sha1 varchar, repo varchar, commit ascii, path varchar, " 77 | "PRIMARY KEY (sha1, repo, commit, path))" % tables["meta"]) 78 | else: 79 | cql("DROP TABLE IF EXISTS %s" % tables["hashes"]) 80 | cql("DROP TABLE IF EXISTS %s" % tables["hashtables"]) 81 | cql("DROP TABLE IF EXISTS %s" % tables["hashtables2"]) 82 | cql("CREATE TABLE %s (sha1 varchar, value blob, PRIMARY KEY (sha1))" % tables["hashes"]) 83 | cql("CREATE TABLE %s (sha1 varchar, hashtable tinyint, value blob, " 84 | "PRIMARY KEY (hashtable, value, sha1))" % tables["hashtables"]) 85 | cql("CREATE TABLE %s (sha1 varchar, hashtable tinyint, value blob, " 86 | "PRIMARY KEY (sha1, hashtable))" % tables["hashtables2"]) 87 | 88 | 89 | class BatchedHashResolver: 90 | def __init__(self, hashes: 
Iterable, batch_size: int, session: Session, table: str): 91 | self.hashes = iter(hashes) 92 | self.batch_size = batch_size 93 | self.session = session 94 | self.table = table 95 | self.buffer = [] 96 | self._log = logging.getLogger("BatchedHashResolver") 97 | 98 | def __next__(self): 99 | while True: 100 | if not self.buffer: 101 | self._pump() 102 | r = None 103 | while r is None and self.buffer: 104 | r = self.buffer.pop() 105 | if r is not None: 106 | return r 107 | 108 | def __iter__(self): 109 | return self 110 | 111 | def _pump(self): 112 | first_hash = next(self.hashes) 113 | try: 114 | fh, fm = first_hash 115 | items = {h: (i, m) for i, (h, m) in zip(range(1, self.batch_size), self.hashes)} 116 | items[fh] = 0, fm 117 | meta = True 118 | except ValueError: 119 | items = {h: i for i, h in zip(range(1, self.batch_size), self.hashes)} 120 | items[first_hash] = 0 121 | meta = False 122 | if not items: 123 | raise StopIteration() 124 | query = "select sha1, repo, commit, path from %s where sha1 in (%s)" % ( 125 | self.table, ",".join("'%s'" % h for h in items)) 126 | self._log.debug("%s in (%d)", query[:query.find(" in (")], len(items)) 127 | rows = self.session.execute(query) 128 | buffer = self.buffer 129 | buffer.extend(None for _ in items) 130 | l = len(items) # noqa 131 | count = 0 132 | for r in rows: 133 | count += 1 134 | if meta: 135 | i, m = items[r.sha1] 136 | else: 137 | i = items[r.sha1] 138 | m = None 139 | # reverse order - we will pop() in __next__ 140 | tr = r.sha1, (r.repo, r.commit, r.path) 141 | buffer[l - i - 1] = (tr + (m,)) if meta else tr 142 | self._log.debug("-> %d", count) 143 | 144 | 145 | class ColorFormatter(logging.Formatter): 146 | """ 147 | logging Formatter which prints messages with colors. 148 | """ 149 | GREEN_MARKERS = [" ok", "ok:", "finished", "completed", "ready", 150 | "done", "running", "success", "saved"] 151 | GREEN_RE = re.compile("|".join(GREEN_MARKERS)) 152 | BEER_MUG = platform.uname().release.endswith("-moby") 153 | FUR_TREE = datetime.now().month == 12 and datetime.now().day >= 8 154 | 155 | def formatMessage(self, record): 156 | level_color = "0" 157 | text_color = "0" 158 | fmt = "" 159 | if record.levelno <= logging.DEBUG: 160 | fmt = "\033[0;37m" + logging.BASIC_FORMAT + "\033[0m" 161 | elif record.levelno <= logging.INFO: 162 | level_color = "1;36" 163 | lmsg = record.message.lower() 164 | if self.GREEN_RE.search(lmsg): 165 | text_color = "1;32" 166 | elif record.levelno <= logging.WARNING: 167 | level_color = "1;33" 168 | elif record.levelno <= logging.CRITICAL: 169 | level_color = "1;31" 170 | if self.BEER_MUG: 171 | spice = "🍺 " 172 | elif self.FUR_TREE: 173 | spice = "🎄 " 174 | else: 175 | spice = "" 176 | if not fmt: 177 | fmt = "\033[" + level_color + \ 178 | "m" + spice + "%(levelname)s\033[0m:%(name)s:\033[" + text_color + \ 179 | "m%(message)s\033[0m" 180 | return fmt % record.__dict__ 181 | 182 | 183 | modelforge.logs.ColorFormatter = ColorFormatter 184 | -------------------------------------------------------------------------------- /apollo/graph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import chain 3 | import logging 4 | import os 5 | import sys 6 | from uuid import uuid4 7 | 8 | from igraph import Graph 9 | from modelforge import Model, merge_strings, split_strings, assemble_sparse_matrix, \ 10 | disassemble_sparse_matrix, register_model 11 | from modelforge.progress_bar import progress_bar 12 | import numpy 13 | 
from pyspark.sql.types import Row 14 | from scipy.sparse import csr_matrix 15 | from sourced.ml.utils import create_spark 16 | from sourced.ml.extractors.helpers import filter_kwargs 17 | 18 | from apollo.cassandra_utils import get_db, configure, BatchedHashResolver, Session 19 | from apollo.query import weighted_jaccard, stream_template 20 | 21 | 22 | @register_model 23 | class ConnectedComponentsModel(Model): 24 | """ 25 | Model to store the connected components. 26 | """ 27 | NAME = "connected_components" 28 | 29 | def construct(self, connected_components, element_to_buckets, element_to_id): 30 | self.id_to_cc = numpy.zeros(len(element_to_id), dtype=numpy.uint32) 31 | for cc, ids in connected_components.items(): 32 | for id_ in ids: 33 | self.id_to_cc[id_] = cc 34 | self.id_to_element = [None] * len(element_to_id) 35 | for k, v in element_to_id.items(): 36 | self.id_to_element[v] = k 37 | data = numpy.ones(sum(map(len, element_to_buckets)), dtype=numpy.uint8) 38 | indices = numpy.zeros(len(data), dtype=numpy.uint32) 39 | indptr = numpy.zeros(len(element_to_buckets) + 1, dtype=numpy.uint32) 40 | pos = 0 41 | for i, element in enumerate(element_to_buckets): 42 | indices[pos:(pos + len(element))] = element 43 | pos += len(element) 44 | indptr[i + 1] = pos 45 | self.id_to_buckets = csr_matrix((data, indices, indptr)) 46 | return self 47 | 48 | def _load_tree(self, tree): 49 | self.id_to_cc = tree["cc"] 50 | self.id_to_cc[0] # do not remove - loads the array from disk 51 | self.id_to_element = split_strings(tree["elements"]) 52 | self.id_to_buckets = assemble_sparse_matrix(tree["buckets"]) 53 | 54 | def dump(self): 55 | return "Number of connected components: %s\nNumber of unique elements: %s" % ( 56 | len(numpy.unique(self.id_to_cc)), len(self.id_to_element)) 57 | 58 | def _generate_tree(self): 59 | return {"cc": self.id_to_cc, "elements": merge_strings(self.id_to_element), 60 | "buckets": disassemble_sparse_matrix(self.id_to_buckets)} 61 | 62 | 63 | def _find_connected_component(buckets, element_to_buckets): 64 | """ 65 | Find connected components among buckets. 
66 | :param buckets: list of buckets where each bucket contains list of elements 67 | :param element_to_buckets: mapping from element to list of buckets where it appears 68 | :return: mapping from connected component to set of elements in it 69 | """ 70 | unvisited_buckets = set(range(len(buckets))) 71 | connected_components_element = defaultdict(set) 72 | 73 | cc_id = 0 # connected component counter 74 | while unvisited_buckets: 75 | pending = {unvisited_buckets.pop()} 76 | while pending: 77 | bucket = pending.pop() 78 | elements = buckets[bucket] 79 | connected_components_element[cc_id].update(elements) 80 | for element in elements: 81 | element_buckets = element_to_buckets[element] 82 | for b in element_buckets: 83 | if b in unvisited_buckets: 84 | pending.add(b) 85 | unvisited_buckets.remove(b) 86 | # increase number of connected components 87 | cc_id += 1 88 | return connected_components_element 89 | 90 | 91 | def find_connected_components(args): 92 | log = logging.getLogger("graph") 93 | session = get_db(args) 94 | table = args.tables["hashtables"] 95 | rows = session.execute("SELECT DISTINCT hashtable FROM %s" % table) 96 | hashtables = sorted(r.hashtable for r in rows) 97 | log.info("Detected %d hashtables", len(hashtables)) 98 | 99 | # Read buckets from database 100 | buckets = [] 101 | element_ids = {} 102 | prev_len = 0 103 | for hashtable in hashtables: 104 | rows = session.execute( 105 | "SELECT sha1, value FROM %s WHERE hashtable=%d" % (table, hashtable)) 106 | band = None 107 | bucket = [] 108 | for row in rows: 109 | eid = element_ids.setdefault(row.sha1, len(element_ids)) 110 | if row.value != band: 111 | if band is not None: 112 | buckets.append(bucket.copy()) 113 | bucket.clear() 114 | band = row.value 115 | bucket.append(eid) 116 | continue 117 | bucket.append(eid) 118 | if bucket: 119 | buckets.append(bucket) 120 | log.info("Fetched %d, %d buckets", hashtable, len(buckets) - prev_len) 121 | prev_len = len(buckets) 122 | 123 | element_to_buckets = [[] for _ in range(len(element_ids))] 124 | for i, bucket in enumerate(buckets): 125 | for element in bucket: 126 | element_to_buckets[element].append(i) 127 | 128 | # Statistics about buckets 129 | levels = (logging.ERROR, logging.INFO) 130 | log.info("Number of buckets: %d", len(buckets)) 131 | log.log(levels[len(element_ids) >= len(buckets[0])], 132 | "Number of elements: %d", len(element_ids)) 133 | epb = sum(map(len, buckets)) / len(buckets) 134 | log.log(levels[epb >= 1], "Average number of elements per bucket: %.1f", epb) 135 | nb = min(map(len, element_to_buckets)) 136 | log.log(levels[nb == len(hashtables)], "Min number of buckets per element: %s", nb) 137 | nb = max(map(len, element_to_buckets)) 138 | log.log(levels[nb == len(hashtables)], "Max number of buckets per element: %s", nb) 139 | log.info("Running CC analysis") 140 | 141 | # Connect components 142 | connected_components_element = _find_connected_component(buckets, element_to_buckets) 143 | log.info("CC number: %d", len(connected_components_element)) 144 | 145 | log.info("Writing %s", args.output) 146 | ConnectedComponentsModel() \ 147 | .construct(connected_components_element, element_to_buckets, element_ids) \ 148 | .save(args.output) 149 | 150 | 151 | def dumpcc(args): 152 | model = ConnectedComponentsModel().load(args.input) 153 | ccs = defaultdict(list) 154 | for i, cc in enumerate(model.id_to_cc): 155 | ccs[cc].append(i) 156 | for _, cc in sorted(ccs.items()): 157 | print(" ".join(model.id_to_element[i] for i in cc)) 158 | 159 | 160 | @register_model 
161 | class CommunitiesModel(Model): 162 | """ 163 | Model to store the node communities. 164 | """ 165 | NAME = "communities" 166 | 167 | def construct(self, communities, id_to_element): 168 | self.communities = communities 169 | self.id_to_element = id_to_element 170 | return self 171 | 172 | def _load_tree(self, tree): 173 | self.id_to_element = split_strings(tree["elements"]) 174 | data, indptr = tree["data"], tree["indptr"] 175 | self.communities = [data[i:j] for i, j in zip(indptr, indptr[1:])] 176 | 177 | def _generate_tree(self): 178 | size = sum(map(len, self.communities)) 179 | data = numpy.zeros(size, dtype=numpy.uint32) 180 | indptr = numpy.zeros(len(self.communities) + 1, dtype=numpy.int64) 181 | pos = 0 182 | for i, community in enumerate(self.communities): 183 | data[pos:pos + len(community)] = community 184 | pos += len(community) 185 | indptr[i + 1] = pos 186 | return {"data": data, "indptr": indptr, "elements": merge_strings(self.id_to_element)} 187 | 188 | def dump(self): 189 | return "Number of communities: %s" % (len(self.communities)) 190 | 191 | def count_elements(self): 192 | return sum(sum(1 for i in c if i < len(self.id_to_element)) for c in self.communities) 193 | 194 | 195 | def detect_communities(args): 196 | log = logging.getLogger("cmd") 197 | ccsmodel = ConnectedComponentsModel().load(args.input) 198 | log.info("Building the connected components") 199 | ccs = defaultdict(list) 200 | for i, c in enumerate(ccsmodel.id_to_cc): 201 | ccs[c].append(i) 202 | buckmat = ccsmodel.id_to_buckets 203 | buckindices = buckmat.indices 204 | buckindptr = buckmat.indptr 205 | total_nvertices = buckmat.shape[0] 206 | linear = args.edges in ("linear", "1") 207 | graphs = [] 208 | communities = [] 209 | if not linear: 210 | log.info("Transposing the matrix") 211 | buckmat_csc = buckmat.T.tocsr() 212 | fat_ccs = [] 213 | for vertices in ccs.values(): 214 | if len(vertices) == 1: 215 | continue 216 | if len(vertices) == 2: 217 | communities.append(vertices) 218 | continue 219 | fat_ccs.append(vertices) 220 | log.info("Building %d graphs", len(fat_ccs)) 221 | for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)): 222 | if linear: 223 | edges = [] 224 | weights = [] 225 | bucket_weights = buckmat.sum(axis=0) 226 | buckets = set() 227 | for i in vertices: 228 | for j in range(buckindptr[i], buckindptr[i + 1]): 229 | bucket = buckindices[j] 230 | weights.append(bucket_weights[0, bucket]) 231 | bucket += total_nvertices 232 | buckets.add(bucket) 233 | edges.append((str(i), str(bucket))) 234 | else: 235 | edges = set() 236 | weights = None 237 | buckets = set() 238 | for i in vertices: 239 | for j in range(buckindptr[i], buckindptr[i + 1]): 240 | buckets.add(buckindices[j]) 241 | for bucket in buckets: 242 | buckverts = \ 243 | buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]] 244 | for i, x in enumerate(buckverts): 245 | for y in buckverts: 246 | if x < y: 247 | edges.add((str(x), str(y))) 248 | buckets.clear() 249 | edges = list(edges) 250 | graph = Graph(directed=False) 251 | graph.add_vertices(list(map(str, vertices + list(buckets)))) 252 | graph.add_edges(edges) 253 | graph.edge_weights = weights 254 | graphs.append(graph) 255 | log.info("Launching the community detection") 256 | detector = CommunityDetector(algorithm=args.algorithm, config=args.params) 257 | if not args.no_spark: 258 | spark = create_spark( 259 | "cmd-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark)).sparkContext 260 | 
communities.extend(spark.parallelize(graphs).flatMap(detector).collect()) 261 | else: 262 | communities.extend(chain.from_iterable(progress_bar( 263 | (detector(g) for g in graphs), log, expected_size=len(graphs)))) 264 | log.info("Overall communities: %d", len(communities)) 265 | log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities])) 266 | log.info("Median community size: %.1f", numpy.median([len(c) for c in communities])) 267 | log.info("Max community size: %d", max(map(len, communities))) 268 | log.info("Writing %s", args.output) 269 | CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output) 270 | 271 | 272 | class CommunityDetector: 273 | def __init__(self, algorithm, config): 274 | self.algorithm = algorithm 275 | self.config = config 276 | 277 | def __call__(self, graph): 278 | action = getattr(graph, "community_" + self.algorithm) 279 | if self.algorithm == "infomap": 280 | kwargs = {"edge_weights": graph.edge_weights} 281 | elif self.algorithm == "leading_eigenvector_naive": 282 | kwargs = {} 283 | else: 284 | kwargs = {"weights": graph.edge_weights} 285 | if self.algorithm == "edge_betweenness": 286 | kwargs["directed"] = False 287 | # TODO: Rollback to action(**kwargs, **self.config) when support for Python3.4 is over 288 | kwargs.update(self.config) 289 | result = action(**kwargs) 290 | if hasattr(result, "as_clustering"): 291 | result = result.as_clustering() 292 | 293 | output = [[] for _ in range(len(result.sizes()))] 294 | for i, memb in enumerate(result.membership): 295 | output[memb].append(int(graph.vs[i]["name"])) 296 | 297 | return output 298 | 299 | 300 | class BatchedCommunityResolver: 301 | def __init__(self, model: CommunitiesModel, batch_size: int, session: Session, table: str): 302 | self._log = logging.getLogger("BatchedCommunityResolver") 303 | self.resolver = progress_bar( 304 | BatchedHashResolver(self._gen_hashes(model), batch_size, session, table), 305 | self._log, expected_size=model.count_elements() 306 | ) 307 | self._prev = None, None, None 308 | 309 | def __next__(self): 310 | pci = self._prev[-1] 311 | com = [self._prev[:-1]] if pci is not None else [] 312 | for sha1, info, ci in self.resolver: 313 | if pci is None: 314 | pci = ci 315 | if pci == ci: 316 | com.append((sha1, info)) 317 | else: 318 | self._prev = sha1, info, ci 319 | if len(com) > 1: 320 | return com 321 | if com and pci is not None: 322 | self._prev = None, None, None 323 | if len(com) > 1: 324 | return com 325 | raise StopIteration() 326 | 327 | def __iter__(self): 328 | return self 329 | 330 | def _gen_hashes(self, model): 331 | id_to_element = model.id_to_element 332 | for i, community in enumerate(model.communities): 333 | for j in community: 334 | try: 335 | yield id_to_element[j].split("@")[1], i 336 | except IndexError: 337 | continue 338 | 339 | 340 | def dumpcmd(args): 341 | log = logging.getLogger("dumpcmd") 342 | model = CommunitiesModel().load(args.input) 343 | log.info("Initializing the sha1 resolver") 344 | communities = BatchedCommunityResolver(model, args.batch, get_db(args), args.tables["meta"]) 345 | stream_template(args.template, sys.stdout, communities=communities, model=model, 346 | model_path=os.path.abspath(args.input)) 347 | 348 | 349 | class CommunityEvaluator: 350 | def __init__(self, threshold, vocabulary_size): 351 | self.threshold = threshold 352 | self.vocabulary_size = vocabulary_size 353 | 354 | def __call__(self, community): 355 | cid, contents = community 356 | elements = defaultdict(list) 357 | 
for t in contents: 358 | elements[t[0]].append(t[1:]) 359 | if len(elements) == 1: 360 | return (0,) * 4 361 | for key, vals in elements.items(): 362 | vec = numpy.zeros(self.vocabulary_size, dtype=numpy.float32) 363 | for i, w in vals: 364 | vec[i] = w 365 | elements[key] = vec 366 | misses = 0 367 | loss = 0 368 | for x, e1 in elements.items(): 369 | for y, e2 in elements.items(): 370 | if x >= y: 371 | continue 372 | sim = weighted_jaccard(e1, e2) 373 | if sim < self.threshold: 374 | loss += (sim - self.threshold) ** 2 375 | misses += 1 376 | count = len(elements) * (len(elements) - 1) / 2 377 | return misses, misses / count, loss, loss / count 378 | 379 | 380 | def evaluate_communities(args): 381 | log = logging.getLogger("evalcc") 382 | model = CommunitiesModel().load(args.input) 383 | configure(args) 384 | spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark)) 385 | log.info("Preparing the communities' RDD") 386 | items = [] 387 | for i, c in progress_bar(enumerate(model.communities), log, 388 | expected_size=len(model.communities)): 389 | for m in c: 390 | if m < len(model.id_to_element): 391 | items.append(Row(sha1=model.id_to_element[m], community=i)) 392 | log.info("Running") 393 | items_in_spark = spark.sparkContext.parallelize(items).toDF() 394 | bags = spark \ 395 | .read \ 396 | .format("org.apache.spark.sql.cassandra") \ 397 | .options(table=args.tables["bags"], keyspace=args.keyspace) \ 398 | .load() 399 | log.info("Loaded the bags, calculating the vocabulary") 400 | vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect() 401 | vocabulary = {v: i for i, v in enumerate(vocabulary)} 402 | log.info("Vocabulary size: %d", len(vocabulary)) 403 | element_to_id = {e: i for i, e in enumerate(model.id_to_element)} 404 | metrics = items_in_spark.join(bags, "sha1").rdd \ 405 | .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \ 406 | .groupByKey() \ 407 | .map(CommunityEvaluator(args.threshold, len(vocabulary))) \ 408 | .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)]) 409 | log.info("Total misses: %d", metrics[0]) 410 | log.info("Average normalized misses: %f", metrics[1] / len(model.communities)) 411 | log.info("Total loss: %f", metrics[2]) 412 | log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities))) 413 | -------------------------------------------------------------------------------- /apollo/hasher.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from uuid import uuid4 4 | 5 | from bblfsh import BblfshClient 6 | from modelforge.model import Model 7 | from modelforge.models import register_model 8 | import numpy 9 | from pyspark.sql.types import Row 10 | from scipy.integrate import quad as integrate 11 | from sourced.ml.models import OrderedDocumentFrequencies 12 | from sourced.ml.utils import create_spark 13 | from sourced.ml.transformers.bow_writer import BOWLoader 14 | from sourced.ml.extractors import __extractors__ 15 | from sourced.ml.extractors.helpers import filter_kwargs 16 | from sourced.ml.algorithms import log_tf_log_idf 17 | 18 | from apollo import cassandra_utils 19 | 20 | ##################################################################################### 21 | # Begin code from https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py # 22 | ##################################################################################### 23 | 24 | 25 | def 
_false_positive_probability(threshold, b, r): 26 | def _probability(s): 27 | return 1 - (1 - s**float(r))**float(b) 28 | a, err = integrate(_probability, 0.0, threshold) 29 | return a 30 | 31 | 32 | def _false_negative_probability(threshold, b, r): 33 | def _probability(s): 34 | return 1 - (1 - (1 - s**float(r))**float(b)) 35 | a, err = integrate(_probability, threshold, 1.0) 36 | return a 37 | 38 | 39 | def calc_hashtable_params(threshold, sample_size, false_positive_weight=0.5, 40 | false_negative_weight=0.5): 41 | """ 42 | Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum 43 | of probabilities of false positive and false negative. 44 | 45 | :return: tuple(number of hashtables, size of each band). 46 | """ 47 | min_error = float("inf") 48 | opt = (0, 0) 49 | for b in range(1, sample_size + 1): 50 | max_r = int(sample_size / b) 51 | for r in range(1, max_r+1): 52 | fp = _false_positive_probability(threshold, b, r) 53 | fn = _false_negative_probability(threshold, b, r) 54 | error = fp*false_positive_weight + fn*false_negative_weight 55 | if error < min_error: 56 | min_error = error 57 | opt = (b, r) 58 | return opt 59 | 60 | 61 | ##################################################################################### 62 | # End code from https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py # 63 | ##################################################################################### 64 | 65 | 66 | @register_model 67 | class WeightedMinHashParameters(Model): 68 | """ 69 | The randomly generated parameters of the Weighted MinHash-er. 70 | """ 71 | NAME = "wmhparams" 72 | 73 | def construct(self, rs, ln_cs, betas): 74 | self.rs = rs 75 | self.ln_cs = ln_cs 76 | self.betas = betas 77 | rs[0] + ln_cs[0] + betas[0] # do not remove - this loads the arrays from disk 78 | return self 79 | 80 | def _load_tree(self, tree): 81 | self.construct(rs=tree["rs"], ln_cs=tree["ln_cs"], betas=tree["betas"]) 82 | 83 | def dump(self): 84 | return """Shape: %s""" % (self.rs.shape,) 85 | 86 | def _generate_tree(self): 87 | return {"rs": self.rs, "ln_cs": self.ln_cs, "betas": self.betas} 88 | 89 | 90 | class HashExploder: 91 | def __init__(self, htnum, band_size): 92 | self.htnum = htnum 93 | self.band_size = band_size 94 | 95 | def __call__(self, record): 96 | key, wmh = record 97 | for hti in range(self.htnum): 98 | yield Row(sha1=key, hashtable=hti, 99 | value=bytearray(wmh[hti * self.band_size:(hti + 1) * self.band_size].data)) 100 | 101 | 102 | def modify_feature_weights(batches, arguments, **kwargs): 103 | extractors = {} 104 | for ex in __extractors__.values(): 105 | if "%s_weight" % ex.NAME in dir(arguments) and \ 106 | getattr(arguments, "%s_weight" % ex.NAME) != 1: 107 | extractors[ex.NAME] = (ex.NAMESPACE, getattr(arguments, "%s_weight" % ex.NAME)) 108 | 109 | if not extractors: 110 | return batches 111 | 112 | err = "You must specify location of docfreq file to modify weights of features" 113 | assert arguments.docfreq is not None, err 114 | assert os.path.isfile(arguments.docfreq), "docfreq should be a file" 115 | 116 | model = OrderedDocumentFrequencies().load(arguments.docfreq) 117 | feature_mapping = model.order 118 | 119 | voc_size = batches[0].matrix.shape[-1] 120 | weights = numpy.ones((voc_size,)) 121 | 122 | for ext in extractors: 123 | namespace = extractors[ext][0] 124 | ind = [feature_mapping[k] for k in feature_mapping if k.startswith(namespace)] 125 | weights[ind] = extractors[ext][1] 126 | 127 | for batch in batches: 128 | # hack to modify attribute in 
namedtuple 129 | batch.matrix.data = batch.matrix.multiply(weights).tocsr().data.astype(numpy.float32) 130 | 131 | return batches 132 | 133 | 134 | def hash_batches(args): 135 | log = logging.getLogger("hash") 136 | log.info("Loading files from %s", args.input) 137 | loader = BOWLoader(args.input) 138 | log.info("%d batches", len(loader)) 139 | 140 | # Check batches 141 | if not loader: 142 | return 143 | 144 | htnum, band_size = calc_hashtable_params( 145 | args.threshold, args.size, args.false_positive_weight, args.false_negative_weight) 146 | log.info("Number of hash tables: %d", htnum) 147 | log.info("Band size: %d", band_size) 148 | cassandra_utils.configure(args) 149 | spark_args = filter_kwargs(args.__dict__, create_spark) 150 | spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext 151 | import libMHCUDA # delayed import which requires CUDA and friends 152 | tables = args.tables 153 | gen = voc_size = None 154 | try: 155 | for i, bow in enumerate(loader): 156 | if voc_size is None: 157 | voc_size = bow.matrix.shape[-1] 158 | log.info("Initializing the generator") 159 | deferred = os.path.isfile(args.params) 160 | gen = libMHCUDA.minhash_cuda_init( 161 | voc_size, args.size, seed=args.seed, devices=args.devices, 162 | verbosity=args.mhc_verbosity, 163 | deferred=deferred) 164 | if deferred: 165 | model = WeightedMinHashParameters().load(args.params) 166 | libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas) 167 | else: 168 | log.info("Writing %s", args.params) 169 | params = libMHCUDA.minhash_cuda_retrieve_vars(gen) 170 | WeightedMinHashParameters().construct(*params).save(args.params) 171 | if bow.matrix.shape[-1] != voc_size: 172 | raise ValueError("The vocabulary sizes do not match: %d != %d" 173 | % (bow.matrix.shape[-1], voc_size)) 174 | log.info("Processing batch %d / %d", i + 1, len(loader)) 175 | # Modify features if needed 176 | # TODO(vmarkovtsev): port to the new structure 177 | # batches = modify_feature_weights(batches, args) 178 | hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix) 179 | job = [(k, h) for k, h in zip(bow.documents, hashes)] 180 | log.info("Saving the hashtables") 181 | df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)) \ 182 | .coalesce(args.partitions, args.shuffle) \ 183 | .toDF() 184 | df.write \ 185 | .format("org.apache.spark.sql.cassandra") \ 186 | .mode("append") \ 187 | .options(table=tables["hashtables"], keyspace=args.keyspace) \ 188 | .save() 189 | df.write \ 190 | .format("org.apache.spark.sql.cassandra") \ 191 | .mode("append") \ 192 | .options(table=tables["hashtables2"], keyspace=args.keyspace) \ 193 | .save() 194 | log.info("Saving the hashes") 195 | spark.parallelize(job) \ 196 | .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \ 197 | .coalesce(args.partitions, args.shuffle) \ 198 | .toDF() \ 199 | .write \ 200 | .format("org.apache.spark.sql.cassandra") \ 201 | .mode("append") \ 202 | .options(table=tables["hashes"], keyspace=args.keyspace) \ 203 | .save() 204 | finally: 205 | libMHCUDA.minhash_cuda_fini(gen) 206 | 207 | 208 | def hash_file(args): 209 | if not args.feature: 210 | raise ValueError("extractors must not be empty") 211 | log = logging.getLogger("hash_file") 212 | vocab = OrderedDocumentFrequencies().load(args.docfreq) 213 | params = WeightedMinHashParameters().load(args.params) 214 | log.info("Extracting UAST from %s", args.file) 215 | uast = BblfshClient(args.bblfsh).parse(args.file).uast 216 | log.info("Populating the bag") 217 | extractors = 
[__extractors__[s]( 218 | args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args)) 219 | for s in args.feature] 220 | bag = numpy.zeros(len(vocab), dtype=numpy.float32) 221 | for ex in extractors: 222 | ex.ndocs = vocab.docs 223 | ex.docfreq = vocab 224 | for k, v in ex.extract(uast): 225 | try: 226 | i = vocab.order[k] 227 | bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs) 228 | except KeyError: 229 | continue 230 | 231 | log.info("Bag size: %d", len(bag.nonzero()[0])) 232 | log.info("Hashing") 233 | 234 | return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs, params.betas), bag 235 | 236 | 237 | def weighted_minhash(v, sample_size, rs, ln_cs, betas): 238 | if sample_size != rs.shape[0]: 239 | raise ValueError("Input sample size mismatch, expecting %d" % rs.shape[0]) 240 | if len(v) != rs.shape[1]: 241 | raise ValueError("Input dimension mismatch, expecting %d" % rs.shape[1]) 242 | 243 | hashvalues = numpy.zeros((sample_size, 2), dtype=numpy.uint32) 244 | vzeros = (v == 0) 245 | if vzeros.all(): 246 | raise ValueError("Input is all zeros") 247 | v[vzeros] = numpy.nan 248 | vlog = numpy.log(v) 249 | v[vzeros] = 0 250 | for i in range(sample_size): 251 | t = numpy.floor((vlog / rs[i]) + betas[i]) 252 | ln_y = (t - betas[i]) * rs[i] 253 | ln_a = ln_cs[i] - ln_y - rs[i] 254 | k = numpy.nanargmin(ln_a) 255 | hashvalues[i][0], hashvalues[i][1] = k, int(t[k]) 256 | return hashvalues 257 | -------------------------------------------------------------------------------- /apollo/query.md.jinja2: -------------------------------------------------------------------------------- 1 | # Similar to {{ origin }} 2 | 3 | Size: {{ size }} 4 | 5 | | SHA1 | Repository | Commit | Path | 6 | |:----:|:-----------|:-------|:-----| 7 | {% for sha1, (repo, commit, path) in items | sort %} 8 | | `{{ sha1 }}` | [{{ repo.rsplit(".git")[0] }}](https://{{ repo.rsplit(".git")[0] }}) | `{{ commit[:8] }}` | [{{ path }}]({{ format_url(repo, commit, path) }}) | 9 | {% endfor %} 10 | -------------------------------------------------------------------------------- /apollo/query.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import logging 3 | import os 4 | import sys 5 | 6 | import jinja2 7 | import numpy 8 | from sourced.ml.models import OrderedDocumentFrequencies 9 | 10 | from apollo.cassandra_utils import get_db, BatchedHashResolver 11 | from apollo.hasher import hash_file, calc_hashtable_params 12 | 13 | 14 | def query(args): 15 | log = logging.getLogger("query") 16 | session = get_db(args) 17 | tables = args.tables 18 | if args.id: 19 | rows = session.execute( 20 | "SELECT hashtable, value FROM %s WHERE sha1='%s'" % (tables["hashtables2"], args.id)) 21 | bands = [(r.hashtable, r.value) for r in rows] 22 | else: 23 | # args.file 24 | if not args.feature: 25 | log.critical("-f / --feature must be specified at least once in file query mode") 26 | return 1 27 | if not args.params: 28 | log.critical("-p / --params must be specified in file query mode") 29 | return 1 30 | wmh, bag = hash_file(args) 31 | htnum, band_size = calc_hashtable_params( 32 | args.threshold, len(wmh), args.false_positive_weight, args.false_negative_weight) 33 | log.info("Number of hash tables: %d", htnum) 34 | log.info("Band size: %d", band_size) 35 | bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data)) 36 | for i in range(htnum)] 37 | similar = set() 38 | log.info("Looking for similar items") 39 | for i, band in bands: 40 | rows = 
session.execute( 41 | "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s" 42 | % (tables["hashtables"], i, codecs.encode(band, "hex").decode())) 43 | similar.update(r.sha1 for r in rows) 44 | log.info("Fetched %d items", len(similar)) 45 | if args.precise: 46 | # Precise bags 47 | vocab = OrderedDocumentFrequencies().load(args.docfreq) 48 | log.info("Calculating the precise result") 49 | if args.id: 50 | rows = session.execute( 51 | "SELECT item, value FROM %s WHERE sha1='%s'" % (tables["bags"], args.id)) 52 | bag = numpy.zeros(len(vocab), dtype=numpy.float32) 53 | for row in rows: 54 | bag[vocab.order[row.item]] = row.value 55 | # Fetch other bags from the DB 56 | precise = [] 57 | for x in similar: 58 | rows = session.execute( 59 | "SELECT item, value FROM %s WHERE sha1='%s'" % (tables["bags"], x)) 60 | other_bag = numpy.zeros(len(vocab), dtype=numpy.float32) 61 | for row in rows: 62 | other_bag[vocab.order[row.item]] = row.value 63 | if weighted_jaccard(bag, other_bag) >= args.threshold: 64 | precise.append(x) 65 | log.info("Survived: %.2f", len(precise) / len(similar)) 66 | similar = precise 67 | if args.id: 68 | try: 69 | similar.remove(args.id) 70 | except KeyError: 71 | # o_O 72 | pass 73 | 74 | similar = [s.split("@")[1] for s in similar] 75 | stream_template(args.template, sys.stdout, size=len(similar), 76 | origin=args.id if args.id else os.path.abspath(args.file), 77 | items=BatchedHashResolver(similar, args.batch, session, tables["meta"])) 78 | 79 | 80 | def weighted_jaccard(vec1, vec2): 81 | return numpy.minimum(vec1, vec2).sum() / numpy.maximum(vec1, vec2).sum() 82 | 83 | 84 | def format_url(repo, commit, path): 85 | if repo.endswith(".git"): 86 | repo = repo[:-4] 87 | if repo.startswith("github.com") or repo.startswith("gitlab.com"): 88 | return "https://%s/blob/%s/%s" % (repo, commit, path) 89 | if repo.startswith("bitbucket.org"): 90 | return "https://%s/src/%s/%s" % (repo, commit, path) 91 | return "[%s %s %s]" % (repo, commit, path) 92 | 93 | 94 | def stream_template(name, dest, **kwargs): 95 | log = logging.getLogger("jinja2") 96 | log.info("Loading the template") 97 | loader = jinja2.FileSystemLoader(("/", os.path.dirname(__file__), os.getcwd()), 98 | followlinks=True) 99 | env = jinja2.Environment( 100 | trim_blocks=True, 101 | lstrip_blocks=True, 102 | keep_trailing_newline=False, 103 | ) 104 | template = loader.load(env, name) 105 | log.info("Rendering") 106 | template.stream(format_url=format_url, **kwargs).dump(dest) 107 | -------------------------------------------------------------------------------- /apollo/report.md.jinja2: -------------------------------------------------------------------------------- 1 | # Code similarity report 2 | 3 | ### Model 4 | Path: `{{ model_path }}` 5 | 6 | Items: {{ model.id_to_element | length }} 7 | 8 | Cardinality: {{ model.communities | length }} 9 | 10 | ### Communities 11 | {% for com in communities %} 12 | 13 | | SHA1 | Repository | Commit | Path | 14 | |:----:|:-----------|:-------|:-----| 15 | {% for sha1, (repo, commit, path) in com | sort %} 16 | | `{{ sha1 }}` | [{{ repo.rsplit(".git")[0] }}](https://{{ repo.rsplit(".git")[0] }}) | `{{ commit[:8] }}` | [{{ path }}]({{ format_url(repo, commit, path) }}) | 17 | {% endfor %} 18 | {% endfor %} 19 | -------------------------------------------------------------------------------- /apollo/warmup.py: -------------------------------------------------------------------------------- 1 | from sourced.ml.extractors.helpers import filter_kwargs 2 | from sourced.ml.utils import 
create_engine
3 | 
4 | 
5 | def warmup(args):
6 |     engine_args = filter_kwargs(args.__dict__, create_engine)
7 |     create_engine("warmup", "/tmp", **engine_args)
8 | 
--------------------------------------------------------------------------------
/doc/101.md:
--------------------------------------------------------------------------------
1 | # Brief guide to finding similar source code with Apollo
2 | 
3 | ### Environment
4 | 
5 | [Babelfish must be running and have the Java driver installed.](https://doc.bblf.sh/user/getting-started.html)
6 | Cassandra or ScyllaDB must be running.
7 | 
8 | ### Prepare the source code
9 | 
10 | Apollo works with Git repositories stored in [Siva](https://github.com/src-d/go-siva) format.
11 | Refer to [Borges](https://github.com/src-d/borges). We expect that the files will be in `/data` below.
12 | 
13 | ### Extract the features
14 | 
15 | We convert every file into a [weighted set of features](https://en.wikipedia.org/wiki/Bag-of-words_model).
16 | The batches for the `hash` command are written to `./bow*.asdf` (split into 2 GB batches by default) and
17 | the calculated global feature value frequencies are written to `./docfreq.asdf`. We use three
18 | extractors: literals, identifiers and deterministic AST subpaths of size 4. We double the importance
19 | of the latter features and throw away any values which appear in fewer than 4 files. Only Java source
20 | code is analysed. We optimize the pipeline execution by using the disk cache to save
21 | the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html) between passes. The extracted bags
22 | are additionally saved in the database.
23 | 
24 | ```
25 | apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf \
26 | -f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
27 | -l Java Python --persist DISK_ONLY
28 | ```
29 | 
30 | > Docker users should add `--bblfsh bblfshd --cassandra cassandra`.
31 | 
32 | More about [`bags`](cmd/bags.md).
33 | 
34 | ### Hash the samples
35 | 
36 | We hash the files which were converted into bags in the previous step and stored as several
37 | `./bow*.asdf` files. The hashing parameters are written to `./params.asdf`.
38 | The Weighted Jaccard Similarity threshold equals `0.8`; the closer it is to 1, the fewer files are considered
39 | similar. The hashtables are written to the database.
40 | 
41 | ```
42 | apollo hash 'bow*.asdf' -p params.asdf -t 0.8
43 | ```
44 | 
45 | > This step requires an NVIDIA GPU.
46 | 
47 | > Docker users should add `--cassandra=cassandra`.
48 | 
49 | More about [`hash`](cmd/hash.md).
50 | 
51 | ### Query for a file
52 | 
53 | Given a Git hash of a file in the dataset, list the similar files:
54 | ```
55 | apollo query -i 
56 | ```
57 | 
58 | > Docker users should add `--cassandra cassandra`.
59 | 
60 | More about [`query`](cmd/query.md).
61 | 
62 | ### Find groups of similar files
63 | 
64 | Find connected components in the resulting similarity graph and write them to `./cc.asdf`.
65 | 
66 | ```
67 | apollo cc -o cc.asdf
68 | ```
69 | 
70 | > Docker users should add `--cassandra cassandra`.
71 | 
72 | Run the default community detection algorithm and write the clusters to `./communities.asdf`.
73 | 
74 | ```
75 | apollo cmd -i cc.asdf -o communities.asdf
76 | ```
77 | 
78 | > Docker users should add `--cassandra cassandra`.
79 | 
80 | Output the report to stdout.
81 | 
82 | ```
83 | apollo dumpcmd communities.asdf
84 | ```
85 | 
86 | > Docker users should add `--cassandra cassandra`.
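If you would rather inspect the clusters programmatically than read the rendered report, the communities model written by `cmd` can be loaded with Apollo's Python API. A minimal sketch; the file name and the printing logic are only for illustration:

```
from apollo.graph import CommunitiesModel

model = CommunitiesModel().load("communities.asdf")
for community in model.communities:
    # Indices that do not fit into id_to_element correspond to the artificial
    # bucket vertices created by the "linear" edge mode, so skip them.
    members = [model.id_to_element[i] for i in community
               if i < len(model.id_to_element)]
    if len(members) > 1:
        print(members)
```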
87 | 
88 | More about: [`cc`](cmd/cc.md), [`cmd`](cmd/cmd.md), [`dumpcmd`](cmd/dumpcmd.md).
89 | 
--------------------------------------------------------------------------------
/doc/GLOSSARY.md:
--------------------------------------------------------------------------------
1 | ## Model
2 | A model is the artifact from running an analysis pipeline.
3 | It is plain data with some methods to access it.
4 | A model can be serialized to bytes and deserialized from bytes.
5 | The underlying storage format is specific to [src-d/modelforge](https://github.com/src-d/modelforge)
6 | and is currently [ASDF](https://github.com/spacetelescope/asdf)
7 | with [lz4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression.
8 | 
9 | ## Pipeline
10 | A tree of linked `sourced.ml.transformers.Transformer` objects which can be executed on PySpark/source{d} engine.
11 | The result is often written on disk as [Parquet](https://parquet.apache.org/) or model files
12 | or to a database.
13 | 
14 | ## Feature
15 | A property of a source code sample.
16 | 
17 | ## Weighted MinHash
18 | An algorithm to approximate the [Weighted Jaccard Similarity](https://en.wikipedia.org/wiki/Jaccard_index#Generalized_Jaccard_similarity_and_distance)
19 | between all the pairs of source code samples in linear time and space. Described by
20 | [Sergey Ioffe](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf).
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | Apollo is a research project started by [source{d}](https://sourced.tech) to find duplicated
4 | source code at scale. It is written in Python 3 and relies on [source{d} engine](https://engine.sourced.tech)
5 | to process "big" source code.
6 | 
7 | Big source code warehouses like GitHub inevitably contain much duplication. Snippets, files or even
8 | projects may have very few differences. Apollo makes it possible to accurately mine those groups of similar
9 | items. Subsequently, a report may be generated to point out refactoring possibilities.
10 | While Apollo can be applied at a small scale, e.g. within a single project, it does not try to replace
11 | any existing tools in that niche.
12 | 
13 | Behind the scenes, all source code samples are hashed with an algorithm which is
14 | [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)-friendly:
15 | similar samples have similar, almost the same hashes. See [the detailed explanation](algorithm.md).
16 | Similarity is subjective and depends on human opinion. Apollo lets you combine
17 | various feature extractors and optimize their weights together with the overall threshold
18 | according to the reference dataset. The reference dataset is the only source of ground truth
19 | and should be manually labelled by a user; however, Apollo supplies sane defaults.
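The similarity measure behind that threshold is the weighted Jaccard index over the feature bags, mirroring the `weighted_jaccard` helper in `apollo/query.py`; the two toy vectors below are invented for illustration:

```
import numpy


def weighted_jaccard(vec1, vec2):
    # vec1 and vec2 hold the feature weights of two samples over a shared vocabulary.
    return numpy.minimum(vec1, vec2).sum() / numpy.maximum(vec1, vec2).sum()


print(weighted_jaccard(numpy.array([1.0, 0.0, 2.0]), numpy.array([1.0, 3.0, 1.0])))  # ~0.33
```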
Gemini](gemini.md) 5 | * [Installation](install/README.md) 6 | * [Pip](install/pip.md) 7 | * [Docker](install/docker.md) 8 | * [Database initialization](install/db.md) 9 | * [Walkthrough](101.md) 10 | * Commands reference 11 | * [resetdb](cmd/resetdb.md) 12 | * [preprocess](cmd/preprocess.md) 13 | * [bags](cmd/bags.md) 14 | * [hash](cmd/hash.md) 15 | * [query](cmd/query.md) 16 | * [cc](cmd/cc.md) 17 | * [dumpcc](cmd/dumpcc.md) 18 | * [cmd](cmd/cmd.md) 19 | * [dumpcmd](cmd/dumpcmd.md) 20 | * [evalcc](cmd/evalcc.md) 21 | * Models reference 22 | * [Weighted MinHash parameters](model/wmh.md) 23 | * [Connected components](model/cc.md) 24 | * [Communities](model/cmd.md) 25 | -------------------------------------------------------------------------------- /doc/algorithm.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/apollo/fcdf67bb579681bbf978168e909cd74207ed06db/doc/algorithm.md -------------------------------------------------------------------------------- /doc/cmd/bags.md: -------------------------------------------------------------------------------- 1 | # Bags command 2 | 3 | This command converts the input repositories to unordered weighted bags of features that are stored in the DB, writes the MinHashCUDA batches, and writes the Ordered Document Frequency model as well as the optional Quantization Levels model. You can specify the following arguments: 4 | 5 | - `-r`/`--repositories`: Path to the input files 6 | - `--parquet`: If your input files are Parquet files 7 | - `--graph`: Path to the output Graphviz file, if you wish to keep the tree 8 | - `-l`/`--languages`: Languages to keep, defaults to all languages detected by Babelfish 9 | - `--dzhigurda`: Index of the last commit to keep, defaults to 0 (only the head), 1 is HEAD~2, etc. 10 | - `--bow`: Path to the output batches 11 | - `--batch`: The maximum size of a single batch in bytes 12 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 13 | - `--docfreq-in`: Path to a precomputed Ordered Document Frequency model 14 | - `--docfreq-out`: Path to the output Ordered Document Frequency model (cannot be used with `--docfreq-in`) 15 | - `-v`/`--vocabulary-size`: Maximum vocabulary size, defaults to 10 million 16 | - `--cached-index-path`: Path to a precomputed Document Frequency model storing an index of the documents to be extracted 17 | - `--partitions`: Used to repartition data, specifies the new number of partitions 18 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 19 | - [Feature arguments](features.md) 20 | - [Spark and Engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 21 | - [Cassandra/Scylla arguments](db.md) 22 | -------------------------------------------------------------------------------- /doc/cmd/cc.md: -------------------------------------------------------------------------------- 1 | # CC command 2 | 3 | This command runs the connected components analysis on previously created hash tables, 4 | and saves the CCs in [this `Model`](/doc/model/cc.md). 
You can specify the following arguments: 5 | 6 | - `-o`/`--output`: Path to the output Connected Components model 7 | - [Cassandra/Scylla arguments](db.md) 8 | 9 | -------------------------------------------------------------------------------- /doc/cmd/cmd.md: -------------------------------------------------------------------------------- 1 | # CMD command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command runs the community detection on a previously created Connected Components 6 | model, and saves the detected communities in [this `Model`](/doc/model/cmd.md). You can specify 7 | the following arguments: 8 | 9 | - `-i`/`--input`: Path to the input Connected Components model 10 | - `-o`/`--output`: Path to the output Community Detection model 11 | - `--edges`: Specific method used to generate edges: quadratic will connect each item in each bucket to all other items, while the default, linear, will create for each bucket an artificial vertex to which all items will be connected. Depending on the method, the edges will be created in quadratic or linear time, relative to the number of buckets 12 | - `--no-spark`: If you do not want to use Spark - *but who would want that?* 13 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) if you choose to use it 14 | - `-a`/`--algorithm`: Community detection algorithm to apply, defaults to `walktrap`, check out the [igraph](http://igraph.org/c/doc/igraph-Community.html) doc to learn more. See below for the full list of available algorithms and their parameters as of today (see the code in `igraph/__init__.py` for a description of the parameters); we excluded parameters that modify the returned object 15 | - `-p`/`--params`: Depending on the algorithm, you may need to specify parameters (JSON format) 16 | 17 | | Algorithm | Parameters | 18 | |:----------:|:----------:| 19 | |community_fastgreedy|weights (for edges)| 20 | |community_infomap|edge_weights, vertex_weights, trials| 21 | |community_leading_eigenvector_naive|clusters| 22 | |community_leading_eigenvector|clusters, weights (for edges), arpack_options| 23 | |community_label_propagation|weights (for edges), initial, fixed| 24 | |community_multilevel|weights (for edges)| 25 | |community_optimal_modularity|weights (for edges)| 26 | |community_edge_betweenness|weights (for edges), clusters, directed| 27 | |community_spinglass|weights (for edges), spins, parupdate, start_temp, stop_temp, cool_fact, update_rule, gamma, _lambda, implementation| 28 | |community_walktrap|weights (for edges), steps| 29 | -------------------------------------------------------------------------------- /doc/cmd/db.md: -------------------------------------------------------------------------------- 1 | # Cassandra/Scylla arguments 2 | 3 | For all of the commands that require access to the database, you can specify the following arguments: 4 | 5 | - `--cassandra`: Specific address of your Scylla/Cassandra DB, if you are not running it locally (the format is `
<address>:<port>`, if you are using the default 9042 port then there is no need to specify it, e.g. `--cassandra scylla`) 6 | - `--keyspace`: Specific name of the Cassandra keyspace, defaults to `apollo` 7 | - `--tables`: Specific table mapping, use JSON format to modify it, e.g. `--tables {"bags": "bags_2", "hashes": "hashes_2", "hashtables": "hashtables_2", "hashtables2": "hashtables2_2"}` 8 | -------------------------------------------------------------------------------- /doc/cmd/dumpcc.md: -------------------------------------------------------------------------------- 1 | # DumpCC command 2 | 3 | This command outputs a report on the given Connected Components model to stdout; you must specify the following argument: 4 | 5 | - `-i`/`--input`: Path to the input Connected Components model 6 | -------------------------------------------------------------------------------- /doc/cmd/dumpcmd.md: -------------------------------------------------------------------------------- 1 | # DumpCMD command 2 | 3 | This command outputs a report on the given Community Detection model to stdout; you can specify the following arguments: 4 | 5 | - `--template`: Path to the `report.md.jinja2` file 6 | - `--batch`: Same as in `query`: the number of hashes to query simultaneously, defaults to 100 7 | - `-i`/`--input`: Path to the input Community Detection model 8 | - [Cassandra/Scylla arguments](db.md) 9 | -------------------------------------------------------------------------------- /doc/cmd/evalcc.md: -------------------------------------------------------------------------------- 1 | # EvalCC command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command calculates the precise similarity and fitness metrics for the given Community Detection model; you can specify the following arguments: 6 | 7 | - `-i`/`--input`: Path to the input Community Detection model 8 | - `-t`/`--threshold`: Jaccard Similarity threshold (float in [0,1]) over which we consider there is similarity, used to calculate the number of misses 9 | - [Cassandra/Scylla arguments](db.md) 10 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 11 | 12 | __Note:__ 13 | 14 | To run this command it is advised to set the Spark parameter `spark.default.parallelism` to a 15 | higher value than the default 200 partitions if you are running on large amounts of data. 
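
For example, assuming the generic `--config key=value` Spark option described in the Spark arguments doc, and illustrative file names and values, the invocation might look like this:

```
apollo evalcc -i communities.asdf -t 0.8 \
    --config spark.default.parallelism=1000
```
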
16 | -------------------------------------------------------------------------------- /doc/cmd/features.md: -------------------------------------------------------------------------------- 1 | # Feature arguments 2 | 3 | For all of the commands that extract features, you must specify the following arguments: 4 | 5 | - `-x`/`--mode`: Mode to select for analysis, defaults to `file`, can also be `repo` or `func` 6 | - `--quant`: Path to the input or output Quantization Levels model (optional, only supported for 7 | the `children` extractor) 8 | - `-f`/`--feature`: Features to extract from each item, at the moment among the ones below 9 | 10 | 11 | | Feature | Description | 12 | |----------|:---------------------------------:| 13 | | graphlet | Converts the UAST to a weighted bag of graphlets; a graphlet of a UAST node is composed of the node itself, its parent and its children | 14 | | lit | Converts the UAST to a weighted bag of literals (UAST node role) | 15 | | id | Converts the UAST to a weighted bag of identifiers (UAST node role) | 16 | | children | Converts the UAST to a bag of (internal type, quantized number of children) pairs, see [quantization](https://en.wikipedia.org/wiki/Quantization_(signal_processing)) for more info | 17 | | uast2seq | Converts the UAST to a bag of sequences of nodes; we use Depth First Search for the traversal of the UAST | 18 | | node2vec | Converts the UAST to a bag of vectorized sequences produced through a random walk | 19 | 20 | You can check out the [Babelfish documentation](https://doc.bblf.sh/) for more information about UASTs. The weights of each feature in a bag are always computed from the observed frequencies. 21 | 22 | For each of the above features you can also specify the following feature-specific arguments: 23 | 24 | | Feature | Flag | Default | Description | 25 | |----------|:---------------------------------:|:-------:|:------------:| 26 | | graphlet | --graphlet-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 27 | | lit | --lit-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 28 | | id | --id-split-stem | False | Whether to split identifiers and consider each part to be a separate one, or not | 29 | | id | --id-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 30 | | children | --children-npartitions | 10 | Number of partitions on which we apply quantization | 31 | | uast2seq | --uast2seq-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 32 | | uast2seq | --uast2seq-seq-len | 5 | Length(s) of sequences, can be a list | 33 | | uast2seq | --uast2seq-stride | 1 | Stride used to iterate through the sequenced UAST to extract subsequences of chosen length | 34 | | node2vec | --node2vec-weight | 1 | Weight of this feature relative to the others (used by TF-IDF) | 35 | | node2vec | --node2vec-seq-len | (5, 6) | Length(s) of sequences to be vectorized, can be a list | 36 | | node2vec | --node2vec-p-explore-neighborhood | 0.5 | Likelihood of immediately revisiting a node in the walk (*return parameter*) | 37 | | node2vec | --node2vec-stride | 1 | Strides used to iterate through the walk sequences to extract subsequences of chosen length | 38 | | node2vec | --node2vec-seed | 42 | Seed to use to generate the random walk | 39 | | node2vec | --node2vec-q-leave-neighborhood | 0.5 | Modulates the ability to differentiate between inward and outward nodes (*in-out parameter*) | 40 | | node2vec | --node2vec-n-walks | 5 | Number of walks from each node. 
| 41 | | node2vec | --node2vec-n-steps | 19 | Number of steps in each walk. | -------------------------------------------------------------------------------- /doc/cmd/hash.md: -------------------------------------------------------------------------------- 1 | # Hash command 2 | 3 | __Currently does not work in Spark Cluster mode.__ 4 | 5 | This command applies the MinHashCUDA algorithm on previously written batches, 6 | stores hashes and hash tables in the DB and saves the Weighted MinHash (WMH) parameters 7 | in [this `Model`](/doc/model/wmh.md). You can specify the following arguments: 8 | 9 | - `-i`/`--input`: Path to the input batch(es) 10 | - `--seed`: Specific random generator seed (useful for cross-execution comparisons), defaults to a random number derived from the current time 11 | - `--mhc-verbosity`: MinHashCUDA log level, specify 0 for silence or 2 for full logs, 1 is the default and just shows progress 12 | - `--devices`: Index of the NVIDIA device to use, defaults to 0 (all available) 13 | - `--docfreq`: Path to the input Ordered Document Frequency model 14 | - `--size`: Hash size, defaults to 128 15 | - `--partitions`: Used to repartition data, specifies the new number of partitions 16 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 17 | - [Cassandra/Scylla arguments](db.md) 18 | - [Spark arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 19 | 20 | You must also specify the WMH arguments: 21 | 22 | - `-p`/`--params`: Path to the output WMH parameters 23 | - `-t`/`--threshold`: Jaccard Similarity threshold (float in [0,1]) over which we consider there is similarity 24 | - `--false-positive-weight`: Parameter that adjusts the relative importance of minimizing the false positive count when optimizing for the Jaccard similarity threshold, defaults to 0.5 25 | - `--false-negative-weight`: Same for false negatives 26 | -------------------------------------------------------------------------------- /doc/cmd/preprocess.md: -------------------------------------------------------------------------------- 1 | # Preprocess command 2 | 3 | This command computes the index and Ordered Document Frequency model for the input repositories, 4 | and optionally the Quantization Levels model if the selected features support it. Currently, running the 5 | `bags` command on large inputs can result in failures; this command allows you to create all the necessary 6 | data to run on subsets of your repositories. As you will be applying TF-IDF, be aware that your 7 | subsets must be disjoint, i.e. if you are running in `repo` mode then repos **must not** be spread 8 | out in different subsets, or there will be duplicate features. 
You can specify the following 9 | arguments: 10 | 11 | - `-r`/`--repositories`: Path to the input files 12 | - `--parquet`: If your input files are Parquet files 13 | - `--graph`: Path to the output Graphviz file, if you wish to keep the tree 14 | - `-l`/`--languages`: Languages to keep, defaults to all languages detected by Babelfish 15 | - `--dzhigurda`: Index of the last commit to keep, defaults to 0 (only the head), 1 is HEAD~2, etc. 16 | - `--bow`: Path to the output batches 17 | - `--batch`: The maximum size of a single batch in bytes 18 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 19 | - `--docfreq-out`: Path to the output Ordered Document Frequency model 20 | - `-v`/`--vocabulary-size`: Maximum vocabulary size, defaults to 10 million 21 | - `--cached-index-path`: Path to the output Document Frequency model storing the index of all documents 22 | - `--partitions`: Used to repartition data, specifies the new number of partitions 23 | - `--shuffle`: Used to repartition data, allows data shuffling (vital if the number of partitions increases!) 24 | - [Feature arguments](features.md) 25 | - [Spark and Engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 26 | -------------------------------------------------------------------------------- /doc/cmd/query.md: -------------------------------------------------------------------------------- 1 | # Query command 2 | 3 | This command finds items similar to the one specified, and outputs them using the `query.md.jinja2` report file. There are two mutually exclusive query modes. For both of them you can specify the following arguments: 4 | 5 | - `--precise`: Whether to calculate the precise set or not 6 | - `--template`: Path to `query.md.jinja2` 7 | - `--batch`: Number of hashes to query simultaneously, defaults to 100 8 | - [Cassandra/Scylla arguments](db.md) 9 | 10 | **Id mode:** 11 | 12 | In this mode, the file is already in the database and its features have been extracted. You only need to specify which file you wish to pick with: 13 | 14 | - `-i`/`--id`: SHA1 identifier of the file. 15 | 16 | **File mode:** 17 | 18 | In this mode, the file is not in the database, so additionally we have to extract the bag of features from that file and apply the MinHashCUDA algorithm on them. You must specify the following arguments: 19 | 20 | - `-c`/`--file`: Absolute path of the file 21 | - `--bblfsh`: Same as in the [engine arguments](https://github.com/src-d/ml/blob/master/doc/spark.md) 22 | - `--docfreq`: Path to the input Ordered Document Frequency model created while running the `bags` command (optional) 23 | - `--min-docfreq`: Specific minimum document frequency of each feature, defaults to 1 24 | - [Feature arguments](bags.md) also used by the `bags` command 25 | - [WMH arguments](hash.md) also used by the `hash` command 26 | -------------------------------------------------------------------------------- /doc/cmd/resetdb.md: -------------------------------------------------------------------------------- 1 | # Resetdb command 2 | 3 | This command destructively resets the database; you can specify the following arguments: 4 | 5 | - `--hashes-only`: To clear only the hash tables 6 | - [Cassandra/Scylla arguments](db.md) 7 | -------------------------------------------------------------------------------- /doc/gemini.md: -------------------------------------------------------------------------------- 1 | # Apollo vs. 
Gemini 2 | 3 | [Gemini](https://github.com/src-d/gemini) is mainly written in Scala and targets production and 4 | the bloody enterprise. Thus it is relatively less flexible, but it should be more performant and efficient. 5 | Apollo is a proving ground for innovation which feeds all its research goodies to Gemini. Regarding 6 | scalability, both are scalable and can process large amounts of data. 7 | 8 | Besides, Apollo is owned by the Machine Learning team and Gemini is owned by the Applications team. 9 | 10 | ### Which one to choose? 11 | 12 | If your goal is doing research and trying new ideas, stick with Apollo. If you want to deduplicate 13 | terabytes of sources in your organization, go with Gemini. -------------------------------------------------------------------------------- /doc/install/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Apollo can be installed in two ways: 4 | 5 | * Through [`pip install`](pip.md). 6 | * As a [Docker container](docker.md). 7 | 8 | New users are recommended to use the container image so that they do not have to set up the complex 9 | environment. 10 | 11 | It is required to [initialize the database](db.md) once Apollo is installed. 12 | -------------------------------------------------------------------------------- /doc/install/db.md: -------------------------------------------------------------------------------- 1 | # Database initialization 2 | 3 | Run the following command: 4 | 5 | ``` 6 | apollo resetdb 7 | ``` 8 | 9 | [More about `resetdb`](../cmd/resetdb.md). -------------------------------------------------------------------------------- /doc/install/docker.md: -------------------------------------------------------------------------------- 1 | # Docker image installation 2 | 3 | ### Requirements 4 | 5 | * NVIDIA GPU 6 | 7 | The following need to be installed and running: 8 | 9 | * [Babelfish](https://doc.bblf.sh/user/getting-started.html) as `bblfshd`. 10 | * [Cassandra](https://hub.docker.com/r/library/cassandra/) or [ScyllaDB](https://hub.docker.com/r/scylladb/scylla/) as `cassandra`. 11 | 12 | ### Magic command 13 | 14 | ``` 15 | docker run --rm -it srcd/apollo --help 16 | ``` 17 | 18 | Throughout the examples, `apollo` stands for the following command: 19 | 20 | ``` 21 | docker run -it --rm -v /path/to/io:/io -w /io --privileged --link bblfshd --link cassandra srcd/apollo 22 | ``` 23 | 24 | `--privileged` is needed to access the NVIDIA devices inside the container without the pain of 25 | manually specifying them; it can be replaced with `--device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0` plus 26 | other `/dev/nvidia*` devices if you've got multiple cards. -------------------------------------------------------------------------------- /doc/install/pip.md: -------------------------------------------------------------------------------- 1 | # Pip installation 2 | 3 | ### Requirements 4 | 5 | * Python 3.4+ 6 | * Linux or macOS. **Windows will not work.** 7 | * NVIDIA GPU 8 | 9 | The following need to be installed: 10 | 11 | * [source{d} engine](https://github.com/src-d/engine) with all of the dependencies such as Babelfish 12 | * [libMHCUDA](https://github.com/src-d/minhashcuda) Python package 13 | * [sourced.ml](https://github.com/src-d/ml) @ `develop` branch 14 | * [Cassandra](http://cassandra.apache.org/) or [ScyllaDB](http://www.scylladb.com/) 15 | 16 | ### Magic command 17 | 18 | ``` 19 | pip3 install git+https://github.com/src-d/apollo 20 | ``` 21 | 22 | It should run without any errors. 
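
If pip does not pull the `develop` branch of sourced.ml on its own, a hedged sketch of installing it explicitly first, using the standard pip VCS syntax (the branch name comes from the requirements above):

```
pip3 install git+https://github.com/src-d/ml@develop
```
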
23 | 24 | ### Testing 25 | 26 | ``` 27 | apollo --help 28 | ``` -------------------------------------------------------------------------------- /doc/model/cc.md: -------------------------------------------------------------------------------- 1 | # Connected Components Model 2 | 3 | This model stores the connected components found in the pairwise similarity 4 | graph after hashing by the `cc` command. 5 | 6 | **A quick reminder** 7 | 8 | A document hashes to as many buckets as there are hashtables, which means if there are 9 | 3 hashtables, then a document hashes to 3 buckets. The number of hashtables increases 10 | as the similarity threshold decreases. Any two documents that hash to at least one bucket 11 | in common are in the same component. 12 | 13 | The model has the following parameters: 14 | 15 | - `cc.id_to_cc`: a numpy array of integers of the size of the number of documents, where 16 | document `i` is in the connected component number `cc.id_to_cc[i]`; 17 | - `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary 18 | mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]` 19 | is file `i`'s filename; 20 | - `cc.id_to_buckets`: a Scipy sparse CSR matrix of the shape `number of documents` 21 | x `number of buckets`, where the element in row `i` and column `j` is equal to 1 if 22 | document `i` hashes to bucket `j`, and 0 if not. 23 | 24 | Example: 25 | 26 | ``` 27 | from apollo.graph import ConnectedComponentsModel 28 | 29 | cc = ConnectedComponentsModel().load("cc.asdf") 30 | print(cc.dump()) # prints the number of CCs and documents 31 | ``` -------------------------------------------------------------------------------- /doc/model/cmd.md: -------------------------------------------------------------------------------- 1 | # Communities Model 2 | 3 | This model stores the communities detected by the `cmd` command from a previously 4 | created Connected Components model. Its contents heavily depend on the chosen algorithm 5 | (and its parameters), but more importantly on the edge creation method, 6 | as is described in [the doc](/doc/cmd/cmd.md). Indeed, if the default linear method 7 | is chosen, then the communities will not only consist of documents, but also 8 | of **buckets**, as they will have been added to the CC graphs as artificial vertices. 9 | This means that, in this case, some communities may consist *only* of buckets. 10 | 11 | The model has the following parameters: 12 | 13 | - `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary 14 | mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]` 15 | is file `i`'s filename; 16 | - `cc.communities`: a list of lists of integers, where each integer in `cc.communities[i]` 17 | is in the `i`th community. If an element `e` in a community is an integer smaller 18 | than the length of the `cc.id_to_elements` dictionary, then it is a document. If not, 19 | it is the bucket number `e - len(cc.id_to_elements)` in the Connected Components 20 | model's `id_to_buckets` parameter which has been used as input. 21 | 22 | The model also has this method: 23 | - `cc.count_elements`: it counts the number of distinct documents in the communities 24 | (not all documents in the dictionary may be in a community, as we don't care for 25 | communities of one). Buckets are not counted by this method. 
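
As an illustration of this convention, a hedged sketch that splits one community into document names and artificial bucket indices (assuming the parameters are exposed as attributes as listed above; the local variable names are ours):

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("communities.asdf")
n_docs = len(cmd.id_to_elements)
first_community = cmd.communities[0]
# Elements below n_docs are documents, the rest are artificial bucket vertices.
documents = [cmd.id_to_elements[e] for e in first_community if e < n_docs]
buckets = [e - n_docs for e in first_community if e >= n_docs]
```
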
26 | 27 | Example: 28 | 29 | ``` 30 | from apollo.graph import CommunitiesModel 31 | 32 | cmd = CommunitiesModel().load("communities.asdf") 33 | print(cmd.dump()) # prints the number of communities (even if containing only buckets) 34 | print("Number of distinct documents: %s" % (cmd.count_elements())) 35 | ``` -------------------------------------------------------------------------------- /doc/model/wmh.md: -------------------------------------------------------------------------------- 1 | # Weighted MinHash Parameters Model 2 | 3 | This model stores the parameters generated by `libMHCUDA`'s `minhash_cuda_retrieve_vars` 4 | function, when running the `hash` command. Named like in Sergey Ioffe's paper, 5 | the parameters are: 6 | 7 | - `wmh.rs`: the quantization granularity; 8 | - `wmh.ln_cs`: the logarithm of the Cauchy variates; 9 | - `wmh.betas`: the random offset. 10 | 11 | All 3 are Numpy arrays of the shape `hash size` x `number of features`. If you wish, 12 | or need, to run the `hash` command multiple times, you should reuse this 13 | model each time, or the result will not be accurate, as the parameters will be 14 | regenerated at random. 15 | 16 | Example: 17 | 18 | ``` 19 | from apollo.hasher import WeightedMinHashParameters 20 | 21 | wmh = WeightedMinHashParameters().load("params.asdf") 22 | print(wmh.dump()) # prints the shape of matrices 23 | ``` -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | apollo: 4 | build: . 5 | image: srcd/apollo 6 | privileged: true 7 | stdin_open: true 8 | tty: true 9 | links: 10 | - bblfshd 11 | - scylla 12 | bblfshd: 13 | image: bblfsh/bblfshd 14 | privileged: true 15 | volumes: 16 | - /var/lib/bblfshd 17 | scylla: 18 | image: scylladb/scylla 19 | command: /docker-entrypoint.py --developer-mode=1 20 | volumes: 21 | - /var/lib/scylla -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cassandra_driver==3.14.0 2 | libMHCUDA==2.1.0 3 | python-igraph==0.7.1.post6 4 | jinja2==2.10 5 | sourced-ml==0.6.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os import path 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name="apollo", 7 | description="source{d} Gemini's evil twin which runs everything using Python.", 8 | version="0.1.0", 9 | license="Apache 2.0", 10 | author="source{d}", 11 | author_email="machine-learning@sourced.tech", 12 | url="https://github.com/src-d/apollo", 13 | download_url="https://github.com/src-d/apollo", 14 | packages=find_packages(exclude=("apollo.tests",)), 15 | entry_points={ 16 | "console_scripts": ["apollo=apollo.__main__:main"], 17 | }, 18 | keywords=["machine learning on source code", "weighted minhash", "minhash", 19 | "bblfsh", "babelfish"], 20 | install_requires=["cassandra_driver >= 3.12.0, <4.0", 21 | "libMHCUDA >= 2.0, <3.0", 22 | "jinja2 >=2.0, <3.0", 23 | "python-igraph >= 0.7, <2.0", 24 | "sourced-ml >= 0.6.0, <0.7"], 25 | package_data={"": ["LICENSE", "README.md"] + glob(path.join("apollo", "*.jinja2"))}, 26 | classifiers=[ 27 | "Development Status :: 3 - Alpha", 28 | "Environment :: Console", 29 | "Intended Audience :: Developers", 30 
| "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: POSIX", 32 | "Programming Language :: Python :: 3.4", 33 | "Programming Language :: Python :: 3.5", 34 | "Programming Language :: Python :: 3.6", 35 | "Topic :: Software Development :: Libraries" 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from modelforge.logs import setup_logging 4 | 5 | 6 | utmain = sys.modules['__main__'] 7 | if utmain.__package__ == "unittest" and utmain.__spec__ is None: 8 | from collections import namedtuple 9 | ModuleSpec = namedtuple("ModuleSpec", ["name"]) 10 | utmain.__spec__ = ModuleSpec("unittest.__main__") 11 | del ModuleSpec 12 | del utmain 13 | 14 | 15 | def setup(): 16 | setup_logging("INFO") 17 | -------------------------------------------------------------------------------- /tests/test_graph_CommunityDetector.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | 4 | from igraph import Graph 5 | 6 | from apollo.graph import CommunityDetector 7 | 8 | 9 | class CommunityDetectorTest(unittest.TestCase): 10 | def setUp(self): 11 | edges = [(0, 1)] 12 | weights = [1] 13 | nvertices = 2 14 | self.graph = Graph(n=nvertices, edges=edges, directed=False) 15 | self.graph.edge_weights = weights 16 | 17 | 18 | def test_generator(algorithm): 19 | def test_community_detection(self): 20 | cmd = CommunityDetector(algorithm=algorithm, config={}) 21 | res = cmd(self.graph) 22 | self.assertEqual(len(set(itertools.chain(*res))), 2) # Check number of unique vertices 23 | 24 | return test_community_detection 25 | 26 | 27 | if __name__ == "__main__": 28 | algorithms = ["spinglass", "optimal_modularity", "multilevel", "label_propagation", 29 | "leading_eigenvector", "leading_eigenvector", "infomap", "walktrap", 30 | "fastgreedy"] 31 | for algorithm in algorithms: 32 | test_name = "test_community_detection_%s" % algorithm 33 | test = test_generator(algorithm) 34 | setattr(CommunityDetectorTest, test_name, test) 35 | print([method for method in dir(CommunityDetectorTest) 36 | if "test_community_detection_" in method]) 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tests/test_graph_ConnectedComponents.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import unittest 4 | 5 | from apollo.graph import _find_connected_component 6 | 7 | 8 | class ConnectedComponentsTest(unittest.TestCase): 9 | def test_empty_connected_component(self): 10 | buckets = [] 11 | element_to_buckets = defaultdict(set) 12 | 13 | res = _find_connected_component(buckets, element_to_buckets) 14 | self.assertEqual(0, len(res)) 15 | self.assertTrue(set(itertools.chain(*buckets)) == set(itertools.chain(*res.values()))) 16 | 17 | def test_one_connected_component(self): 18 | buckets = [] 19 | element_to_buckets = defaultdict(set) 20 | 21 | # Create one connected component 22 | for _ in range(5): 23 | bucket_id = len(buckets) 24 | buckets.append([bucket_id, bucket_id + 1]) 25 | element_to_buckets[bucket_id].add(bucket_id) 26 | element_to_buckets[bucket_id + 1].add(bucket_id) 27 | res = _find_connected_component(buckets, element_to_buckets) 28 | self.assertEqual(1, len(res)) 29 | self.assertTrue(set(itertools.chain(*buckets)) == 
set(itertools.chain(*res.values()))) 30 | 31 | def test_two_connected_components(self): 32 | buckets = [] 33 | element_to_buckets = defaultdict(set) 34 | 35 | # Create one connected component 36 | for _ in range(5): 37 | bucket_id = len(buckets) 38 | buckets.append([bucket_id, bucket_id + 1]) 39 | element_to_buckets[bucket_id].add(bucket_id) 40 | element_to_buckets[bucket_id + 1].add(bucket_id) 41 | 42 | bucket_id = len(buckets) 43 | buckets.append([bucket_id]) 44 | element_to_buckets[bucket_id].add(bucket_id) 45 | 46 | # Create another connected component 47 | for _ in range(5): 48 | bucket_id = len(buckets) 49 | buckets.append([bucket_id, bucket_id + 1]) 50 | element_to_buckets[bucket_id].add(bucket_id) 51 | element_to_buckets[bucket_id + 1].add(bucket_id) 52 | 53 | res = _find_connected_component(buckets, element_to_buckets) 54 | self.assertEqual(2, len(res)) 55 | self.assertTrue(set(itertools.chain(*buckets)) == set(itertools.chain(*res.values()))) 56 | 57 | 58 | if __name__ == "__main__": 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /tests/test_modify_feature_weights.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from copy import deepcopy 3 | import os 4 | import unittest 5 | from unittest.mock import patch 6 | import tempfile 7 | 8 | import numpy 9 | from sourced.ml.models import OrderedDocumentFrequencies 10 | # from sourced.ml.transformers import BagsBatch 11 | import sourced 12 | 13 | 14 | from apollo.hasher import modify_feature_weights 15 | 16 | 17 | class DummyClass: 18 | pass 19 | 20 | 21 | def dict_to_arguments(d): 22 | res = DummyClass() 23 | 24 | for key in d: 25 | setattr(res, key, d[key]) 26 | 27 | return res 28 | 29 | 30 | @unittest.skip("Skipping test until TODO is done in hasher:177 ") 31 | class FeatureWeightTest(unittest.TestCase): 32 | FakeExtractor = namedtuple("FakeExtractor", ("NAME", "NAMESPACE")) 33 | 34 | def setUp(self): 35 | docs = 1 36 | freq = 1 37 | default_weight = 1 38 | docfreqs = [] 39 | self.extractors = {} 40 | self.extractor_args = {} 41 | for i in range(2): 42 | namespace = "extractor%s." 
% i 43 | feat_freq = {} 44 | for j in range(2): 45 | feat_freq[namespace + str(j)] = freq 46 | docfreqs.append(feat_freq) 47 | 48 | self.extractors[namespace] = self.FakeExtractor(NAME=namespace, NAMESPACE=namespace) 49 | self.extractor_args["%s_weight" % namespace] = default_weight 50 | 51 | # Create tmp file and save OrderedDocumentFrequencies there 52 | self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False) 53 | model = OrderedDocumentFrequencies().construct(docs, docfreqs) 54 | model.save(self.tmp_file.name) 55 | 56 | # arguments.docfreq 57 | self.docfreq_args = {"docfreq": self.tmp_file.name} 58 | 59 | # batches 60 | self.batches = [] # [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))] 61 | 62 | def tearDown(self): 63 | self.tmp_file.close() 64 | try: 65 | os.remove(self.tmp_file.name) 66 | except OSError: 67 | pass 68 | 69 | def test_empty_extractors(self): 70 | arguments = dict_to_arguments(self.docfreq_args) 71 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 72 | result = modify_feature_weights(deepcopy(self.batches), arguments) 73 | self.assertEqual(len(result), len(self.batches)) 74 | for bathc_res, batch_init in zip(result, self.batches): 75 | bathc_res.matrix.sort_indices() 76 | batch_init.matrix.sort_indices() 77 | 78 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 79 | batch_init.matrix.indices)) 80 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, batch_init.matrix.data)) 81 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 82 | batch_init.matrix.indptr)) 83 | 84 | def test_extractor_weight_1(self): 85 | self.docfreq_args.update(self.extractor_args) 86 | arguments = dict_to_arguments(self.docfreq_args) 87 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 88 | result = modify_feature_weights(deepcopy(self.batches), arguments) 89 | self.assertEqual(len(result), len(self.batches)) 90 | for bathc_res, batch_init in zip(result, self.batches): 91 | bathc_res.matrix.sort_indices() 92 | batch_init.matrix.sort_indices() 93 | 94 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 95 | batch_init.matrix.indices)) 96 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, batch_init.matrix.data)) 97 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 98 | batch_init.matrix.indptr)) 99 | 100 | def test_empty_batches(self): 101 | self.docfreq_args.update(self.extractor_args) 102 | arguments = dict_to_arguments(self.docfreq_args) 103 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 104 | result = modify_feature_weights([], arguments) 105 | self.assertEqual(len(result), 0) 106 | 107 | def test_no_docfreq(self): 108 | no_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False) 109 | no_file.close() 110 | try: 111 | os.remove(no_file.name) 112 | except OSError: 113 | pass 114 | 115 | no_docfreq = {"docfreq": no_file.name} 116 | no_docfreq.update(self.extractor_args) 117 | arguments = dict_to_arguments(self.docfreq_args) 118 | with patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 119 | self.assertRaises(Exception, modify_feature_weights(self.batches, arguments)) 120 | 121 | def test_normal_run(self): 122 | self.docfreq_args.update(self.extractor_args) 123 | weight = 2 124 | for key in self.docfreq_args: 125 | if "_weight" in key: 126 | self.docfreq_args[key] *= weight # make not 1 127 | arguments = dict_to_arguments(self.docfreq_args) 128 | with 
patch.dict(sourced.ml.extractors.__extractors__, self.extractors, clear=True): 129 | result = modify_feature_weights(deepcopy(self.batches), arguments) 130 | self.assertEqual(len(result), len(self.batches)) 131 | for bathc_res, batch_init in zip(result, self.batches): 132 | bathc_res.matrix.sort_indices() 133 | batch_init.matrix.sort_indices() 134 | 135 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indices, 136 | batch_init.matrix.indices)) 137 | self.assertTrue(numpy.array_equal(bathc_res.matrix.data, 138 | batch_init.matrix.data * weight)) 139 | self.assertTrue(numpy.array_equal(bathc_res.matrix.indptr, 140 | batch_init.matrix.indptr)) 141 | pass 142 | 143 | 144 | if __name__ == "__main__": 145 | unittest.main() 146 | --------------------------------------------------------------------------------