├── .dockerignore ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DCO ├── Dockerfile ├── ISSUE_TEMPLATE.md ├── LICENSE ├── MAINTAINERS ├── Makefile ├── README.md ├── _examples ├── README.md ├── notebooks │ └── Example.ipynb ├── pyspark │ ├── pyspark-shell-basic.md │ ├── pyspark-shell-classifying-languages.md │ ├── pyspark-shell-lang-and-uast.md │ ├── pyspark-shell-raw-repositories.md │ ├── pyspark-shell-schemas.md │ ├── pyspark-shell-uast-extraction.md │ └── pyspark-shell-xpath-query.md ├── scala │ ├── spark-shell-basic.md │ ├── spark-shell-classifying-languages.md │ ├── spark-shell-lang-and-uast.md │ ├── spark-shell-raw-repositories.md │ ├── spark-shell-schemas.md │ ├── spark-shell-uast-extraction.md │ └── spark-shell-xpath-query.md └── siva-files │ ├── 2d58138f24fa863c235b0c33158b870a40c79ee2.siva │ ├── 5d4a8bf30c0da7209f651632b62a362620556c85.siva │ └── aac052c42c501abf6aa8c3509424e837bb27e188.siva ├── build.sbt ├── documentation └── proposals │ ├── ENIP-000.md │ ├── ENIP-001.md │ ├── ENIP-002.md │ ├── ENIP-003.md │ ├── ENIP-004.md │ └── README.md ├── key.asc.enc ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── python ├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── setup.cfg ├── setup.py ├── sourced │ ├── __init__.py │ ├── engine │ │ ├── __init__.py │ │ └── engine.py │ └── examples │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── repo_files.py │ │ ├── repo_references.py │ │ ├── repos.py │ │ └── uasts.py └── test │ ├── __init__.py │ ├── base.py │ ├── test_engine.py │ └── test_sourced_dataframe.py ├── sbt ├── scalastyle-config.xml └── src ├── main └── scala │ ├── org │ └── apache │ │ └── spark │ │ └── UtilsWrapper.scala │ └── tech │ └── sourced │ └── engine │ ├── DefaultSource.scala │ ├── Engine.scala │ ├── MetadataSource.scala │ ├── QueryBuilder.scala │ ├── Schema.scala │ ├── Sources.scala │ ├── TableBuilder.scala │ ├── compat │ └── compat.scala │ ├── exception │ └── RepositoryException.scala │ ├── iterator │ ├── BlobIterator.scala │ ├── ChainableIterator.scala │ ├── CleanupIterator.scala │ ├── CommitIterator.scala │ ├── GitTreeEntryIterator.scala │ ├── MetadataIterator.scala │ ├── ReferenceIterator.scala │ ├── RepositoryIterator.scala │ └── RootedRepo.scala │ ├── package.scala │ ├── provider │ ├── ReadOnlyFileRepository.scala │ ├── RepositoryProvider.scala │ └── RepositoryRDDProvider.scala │ ├── rule │ ├── AddSourceToAttributes.scala │ ├── RelationOptimizer.scala │ ├── SquashGitRelationsJoin.scala │ └── SquashMetadataRelationsJoin.scala │ ├── udf │ ├── ClassifyLanguagesUDF.scala │ ├── ConcatArrayUDF.scala │ ├── CustomUDF.scala │ ├── ExtractTokensUDF.scala │ ├── ExtractUASTsUDF.scala │ └── QueryXPathUDF.scala │ └── util │ ├── Bblfsh.scala │ ├── Filters.scala │ ├── GitUrlsParser.scala │ └── MD5Gen.scala └── test ├── resources ├── bad-siva-files │ └── 0a0bfaa46954437548fbaeb0e19237f84e968511.siva ├── log4j.properties ├── siva-files │ ├── 05893125684f2d3943cd84a7ab2b75e53668fba1.siva │ ├── ff │ │ └── fff840f8784ef162dc83a1465fc5763d890b68ba.siva │ ├── fff7062de8474d10a67d417ccea87ba6f58ca81d.siva │ └── not-siva.txt └── zip-slip-siva-files │ └── git-zipslip.siva └── scala └── tech └── sourced └── engine ├── BaseSivaSpec.scala ├── BaseSourceSpec.scala ├── BaseSparkSpec.scala ├── DefaultSourceSpec.scala ├── EngineSpec.scala ├── FilterUDFSpec.scala ├── MetadataSourceSpec.scala ├── QueryBuilderSpec.scala ├── StorageLevelSpec.scala ├── iterator ├── BaseChainableIterator.scala ├── BlobIteratorSpec.scala ├── CommitIteratorSpec.scala 
├── GitTreeEntryIteratorSpec.scala ├── MetadataIteratorSpec.scala ├── ReferenceIteratorSpec.scala └── RepositoryIteratorSpec.scala ├── provider ├── RepositoryProviderSpec.scala └── RepositoryRDDProviderSpec.scala ├── udf └── CustomUDFSpec.scala └── util ├── FilterSpec.scala └── RepoUtils.scala /.dockerignore: -------------------------------------------------------------------------------- 1 | # Exclude all directories and files except those used by Dockerfile to build the image 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | target/* 4 | project/target/* 5 | project/project/* 6 | .idea 7 | .docsrv-resources 8 | /bin/ 9 | .cache* 10 | .#* 11 | .project 12 | .settings 13 | key.asc 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: required 3 | 4 | language: scala 5 | scala: 2.11.11 6 | 7 | cache: 8 | directories: 9 | - $HOME/.sbt 10 | - $HOME/.ivy2 11 | 12 | services: 13 | - docker 14 | 15 | install: 16 | - | 17 | set -e 18 | if [[ ${LANGUAGE} = python ]]; then 19 | sudo apt install libxml2-dev curl build-essential 20 | make build 21 | cd python 22 | pip install -e . 23 | fi 24 | 25 | before_script: 26 | - make -f "$TRAVIS_BUILD_DIR/Makefile" docker-bblfsh 27 | - make -f "$TRAVIS_BUILD_DIR/Makefile" docker-bblfsh-install-drivers 28 | 29 | script: 30 | - if [[ ${LANGUAGE} = python ]]; then make test ;fi 31 | - | 32 | set -e 33 | if [[ ${LANGUAGE} = java ]]; then 34 | make travis-test 35 | bash <(curl -s https://codecov.io/bash) 36 | fi 37 | 38 | jobs: 39 | include: 40 | - {env: 'LANGUAGE=java SPARK_VERSION=2.2.1', jdk: openjdk8} 41 | - {env: 'LANGUAGE=java SPARK_VERSION=2.3.1', jdk: openjdk8} 42 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.4, language: python} 43 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.5, language: python} 44 | - {env: 'LANGUAGE=python SPARK_VERSION=2.2.1', python: 3.6, language: python} 45 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.6, language: python} 46 | 47 | - stage: deploy 48 | if: tag IS present OR (branch = master AND env(TRAVIS_PULL_REQUEST) IS present) 49 | jdk: openjdk8 50 | 51 | install: skip 52 | before_script: skip 53 | 54 | script: 55 | - openssl aes-256-cbc -K $encrypted_8a9ac81f2640_key -iv $encrypted_8a9ac81f2640_iv -in key.asc.enc -out key.asc -d 56 | - gpg --no-default-keyring --primary-keyring ./project/.gnupg/pubring.gpg --secret-keyring ./project/.gnupg/secring.gpg --keyring ./project/.gnupg/pubring.gpg --fingerprint --import key.asc 57 | - make build 58 | - cp target/jgit-spark-connector-uber.jar "jgit-spark-connector-$TRAVIS_TAG.jar" 59 | - make docker-push 60 | 61 | deploy: 62 | - provider: script 63 | script: make maven-release 64 | skip_cleanup: true 65 | on: 66 | tags: true 67 | - provider: releases 68 | api_key: 69 | secure: $GITHUB_TOKEN 70 | file_glob: true 71 | file: "*.jar" 72 | skip_cleanup: true 73 | on: 74 | tags: true 75 | 76 | - if: tag IS present 77 | language: python 78 | python: 3.6 79 | 80 | script: 81 | - sudo apt install libxml2-dev curl build-essential 82 | - make build 83 | - cd python 84 | - pip install -e . 
85 | - echo "$TRAVIS_TAG" | cut -c 2- > version.txt 86 | 87 | deploy: 88 | - provider: pypi 89 | user: $PYPI_USERNAME 90 | password: $PYPI_PASSWORD 91 | skip_cleanup: true 92 | on: 93 | tags: true 94 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at conduct@sourced.tech. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 660 York Street, Suite 102, 6 | San Francisco, CA 94110 USA 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. 37 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jdk as builder 2 | RUN apt-get update && apt-get install -y --no-install-recommends git 3 | RUN mkdir /jgit-spark-connector 4 | WORKDIR /jgit-spark-connector 5 | COPY . 
/jgit-spark-connector 6 | RUN ./sbt assembly 7 | 8 | FROM srcd/jupyter-spark:5.2.1 9 | 10 | RUN mkdir -p /opt/ 11 | 12 | # jgit-spark-connector jar location 13 | ENV SPARK_DRIVER_EXTRA_CLASSPATH spark.driver.extraClassPath 14 | ENV SPARK_EXECUTOR_EXTRA_CLASSPATH spark.executor.extraClassPath 15 | ENV SRCD_JAR /opt/jars/jgit-spark-connector-uber.jar 16 | 17 | # bblfsh endpoint variables 18 | ENV SPARK_BBLFSH_HOST spark.tech.sourced.bblfsh.grpc.host 19 | ENV BBLFSH_HOST bblfshd 20 | ENV SPARK_BBLFSH_PORT spark.tech.sourced.bblfsh.grpc.port 21 | ENV BBLFSH_PORT 9432 22 | 23 | USER root 24 | 25 | RUN apt-get update && \ 26 | apt-get install -y --no-install-suggests --no-install-recommends locales curl g++ libxml2-dev && \ 27 | apt-get clean && \ 28 | locale-gen en_US.UTF-8 29 | 30 | ENV LANG en_US.UTF-8 31 | 32 | COPY ./python /opt/python-jgit-spark-connector/ 33 | COPY ./_examples/notebooks/* /home/$NB_USER/ 34 | COPY --from=builder /jgit-spark-connector/target/jgit-spark-connector-uber.jar /opt/jars/ 35 | 36 | 37 | RUN echo "local" > /opt/python-jgit-spark-connector/version.txt \ 38 | && pip install -e /opt/python-jgit-spark-connector/ \ 39 | && pip install jupyter-spark \ 40 | && jupyter serverextension enable --py jupyter_spark \ 41 | && jupyter nbextension install --py jupyter_spark \ 42 | && jupyter nbextension enable --py jupyter_spark \ 43 | && jupyter nbextension enable --py widgetsnbextension 44 | 45 | # Separate the config file in a different RUN creation as this may change more often 46 | RUN echo "$SPARK_DRIVER_EXTRA_CLASSPATH $SRCD_JAR\n$SPARK_EXECUTOR_EXTRA_CLASSPATH $SRCD_JAR" >> /usr/local/spark/conf/spark-defaults.conf \ 47 | && echo "$SPARK_BBLFSH_HOST $BBLFSH_HOST\n$SPARK_BBLFSH_PORT $BBLFSH_PORT" >> /usr/local/spark/conf/spark-defaults.conf 48 | 49 | # Disable jupyter token 50 | RUN mkdir -p /root/.jupyter && \ 51 | echo "c.NotebookApp.token = ''" > ~/.jupyter/jupyter_notebook_config.py && \ 52 | echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py && \ 53 | echo "c.NotebookApp.notebook_dir = '/home'" >> ~/.jupyter/jupyter_notebook_config.py && \ 54 | echo "c.NotebookApp.port = 8080" >> ~/.jupyter/jupyter_notebook_config.py 55 | 56 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Your issue may already be reported! 2 | Please search on the [issue track](../) before creating one. 3 | 4 | ## Expected Behavior 5 | 6 | 7 | 8 | ## Current Behavior 9 | 10 | 11 | 12 | ## Possible Solution 13 | 14 | 15 | 16 | ## Steps to Reproduce (for bugs) 17 | 18 | 19 | 1. 20 | 2. 21 | 3. 22 | 4. 
23 | 24 | ## Context 25 | 26 | 27 | 28 | ## Your Environment (for bugs) 29 | 30 | * Spark version: 31 | * PySpark version (if using PySpark): 32 | * jgit-spark-connector version: 33 | * Operating System and version: 34 | * Some needed resources to reproduce the problem: 35 | -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Antonio Navarro Perez (@ajnavarro) 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Docsrv: configure the languages whose api-doc can be auto generated 2 | LANGUAGES = "go scala python" 3 | # Docsrv: configure the directory containing the python sources 4 | PYTHON_MAIN_DIR ?= ./python 5 | # Docs: do not edit this 6 | DOCS_REPOSITORY := https://github.com/src-d/docs 7 | SHARED_PATH ?= $(shell pwd)/.docsrv-resources 8 | DOCS_PATH ?= $(SHARED_PATH)/.docs 9 | $(DOCS_PATH)/Makefile.inc: 10 | git clone --quiet --depth 1 $(DOCS_REPOSITORY) $(DOCS_PATH); 11 | -include $(DOCS_PATH)/Makefile.inc 12 | 13 | # Docker 14 | DOCKER_CMD = docker 15 | DOCKER_BUILD = $(DOCKER_CMD) build 16 | DOCKER_TAG ?= $(DOCKER_CMD) tag 17 | DOCKER_PUSH ?= $(DOCKER_CMD) push 18 | DOCKER_RUN = $(DOCKER_CMD) run 19 | DOCKER_RMI = $(DOCKER_CMD) rmi -f 20 | DOCKER_EXEC = $(DOCKER_CMD) exec 21 | 22 | # Docker run bblfsh server container 23 | BBLFSH_CONTAINER_NAME = bblfshd 24 | BBLFSH_HOST_PORT = 9432 25 | BBLFSH_CONTAINER_PORT = 9432 26 | BBLFSH_HOST_VOLUME = /var/lib/bblfshd 27 | BBLFSH_CONTAINER_VOLUME = /var/lib/bblfshd 28 | BBLFSH_IMAGE = bblfsh/bblfshd 29 | BBLFSH_VERSION = v2.5.0 30 | 31 | BBLFSH_RUN_FLAGS := --detach --name $(BBLFSH_CONTAINER_NAME) --privileged \ 32 | -p $(BBLFSH_HOST_PORT):$(BBLFSH_CONTAINER_PORT) \ 33 | -v $(BBLFSH_HOST_VOLUME):$(BBLFSH_CONTAINER_VOLUME) \ 34 | $(BBLFSH_IMAGE):$(BBLFSH_VERSION) 35 | 36 | BBLFSH_EXEC_FLAGS = -it 37 | BBLFSH_CTL = bblfshctl 38 | BBLFSH_CTL_DRIVER := $(BBLFSH_CTL) driver 39 | 40 | BBLFSH_CTL_LIST_DRIVERS := $(BBLFSH_CTL_DRIVER) list 41 | BBLFSH_EXEC_LIST_COMMAND := $(BBLFSH_CONTAINER_NAME) bblfshctl driver list 42 | BBLFSH_LIST_DRIVERS := $(BBLFSH_EXEC_FLAGS) $(BBLFSH_EXEC_LIST_COMMAND) 43 | 44 | 45 | # escape_docker_tag escape colon char to allow use a docker tag as rule 46 | define escape_docker_tag 47 | $(subst :,--,$(1)) 48 | endef 49 | 50 | # unescape_docker_tag an escaped docker tag to be use in a docker command 51 | define unescape_docker_tag 52 | $(subst --,:,$(1)) 53 | endef 54 | 55 | # Docker jupyter image tag 56 | GIT_COMMIT=$(shell git rev-parse HEAD | cut -c1-7) 57 | GIT_DIRTY= 58 | ifneq ($(shell git status --porcelain), ) 59 | GIT_DIRTY := -dirty 60 | endif 61 | DEV_PREFIX := dev 62 | VERSION ?= $(DEV_PREFIX)-$(GIT_COMMIT)$(GIT_DIRTY) 63 | 64 | # Docker jupyter image 65 | JUPYTER_IMAGE ?= srcd/jgit-spark-connector-jupyter 66 | JUPYTER_IMAGE_VERSIONED ?= $(call escape_docker_tag,$(JUPYTER_IMAGE):$(VERSION)) 67 | 68 | # Docker run jupyter container 69 | JUPYTER_CONTAINER_NAME = jgit-spark-connector-jupyter 70 | JUPYTER_HOST_PORT = 8080 71 | JUPYTER_CONTAINER_PORT = 8080 72 | REPOSITORIES_HOST_DIR := $(PWD)/_examples/siva-files 73 | REPOSITORIES_CONTAINER_DIR = /repositories 74 | JUPYTER_RUN_FLAGS := --name $(JUPYTER_CONTAINER_NAME) --rm -it \ 75 | -p $(JUPYTER_HOST_PORT):$(JUPYTER_CONTAINER_PORT) \ 76 | -v $(REPOSITORIES_HOST_DIR):$(REPOSITORIES_CONTAINER_DIR) \ 77 | --link 
$(BBLFSH_CONTAINER_NAME):$(BBLFSH_CONTAINER_NAME) \ 78 | $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 79 | 80 | # Versions 81 | SCALA_VERSION ?= 2.11.11 82 | SPARK_VERSION ?= 2.2.1 83 | 84 | # if TRAVIS_SCALA_VERSION defined SCALA_VERSION is overrided 85 | ifneq ($(TRAVIS_SCALA_VERSION), ) 86 | SCALA_VERSION := $(TRAVIS_SCALA_VERSION) 87 | endif 88 | 89 | # if TRAVIS_TAG defined VERSION is overrided 90 | ifneq ($(TRAVIS_TAG), ) 91 | VERSION := $(TRAVIS_TAG) 92 | endif 93 | 94 | # if we are not in master, and it's not a tag the push is disabled 95 | ifneq ($(TRAVIS_BRANCH), master) 96 | ifeq ($(TRAVIS_TAG), ) 97 | pushdisabled = "push disabled for non-master branches" 98 | endif 99 | endif 100 | 101 | # if this is a pull request, the push is disabled 102 | ifneq ($(TRAVIS_PULL_REQUEST), false) 103 | pushdisabled = "push disabled for pull-requests" 104 | endif 105 | 106 | #SBT 107 | SBT = ./sbt ++$(SCALA_VERSION) -Dspark.version=$(SPARK_VERSION) 108 | 109 | # Rules 110 | all: clean build 111 | 112 | clean: 113 | $(SBT) clean 114 | 115 | test: 116 | $(SBT) test 117 | 118 | build: 119 | $(SBT) assembly 120 | 121 | travis-test: 122 | $(SBT) clean coverage test coverageReport scalastyle test:scalastyle 123 | 124 | docker-bblfsh: 125 | $(DOCKER_RUN) $(BBLFSH_RUN_FLAGS) 126 | 127 | docker-bblfsh-install-drivers: 128 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install go bblfsh/go-driver:v0.4.0 129 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install python bblfsh/python-driver:v2.0.0 130 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install java bblfsh/java-driver:v1.2.6 131 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install ruby bblfsh/ruby-driver:v2.0.0 132 | 133 | docker-bblfsh-list-drivers: 134 | $(DOCKER_EXEC) $(BBLFSH_LIST_DRIVERS) 135 | 136 | docker-build: 137 | $(if $(pushdisabled),$(error $(pushdisabled))) 138 | 139 | $(DOCKER_BUILD) -t $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) . 
140 | 141 | docker-run: 142 | $(DOCKER_RUN) $(JUPYTER_RUN_FLAGS) 143 | 144 | docker-clean: 145 | $(DOCKER_RMI) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 146 | 147 | docker-push: docker-build 148 | $(if $(pushdisabled),$(error $(pushdisabled))) 149 | 150 | @if [ "$$DOCKER_USERNAME" != "" ]; then \ 151 | $(DOCKER_CMD) login -u="$$DOCKER_USERNAME" -p="$$DOCKER_PASSWORD"; \ 152 | fi; 153 | 154 | $(DOCKER_PUSH) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 155 | @if [ "$$TRAVIS_TAG" != "" ]; then \ 156 | $(DOCKER_TAG) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) \ 157 | $(call unescape_docker_tag,$(JUPYTER_IMAGE)):latest; \ 158 | $(DOCKER_PUSH) $(call unescape_docker_tag,$(JUPYTER_IMAGE):latest); \ 159 | fi; 160 | 161 | maven-release: 162 | $(SBT) clean publishSigned && \ 163 | $(SBT) sonatypeRelease 164 | -------------------------------------------------------------------------------- /_examples/README.md: -------------------------------------------------------------------------------- 1 | # jgit-spark-connector 2 | 3 | Here you can find a list of annotated *jgit-spark-connector* examples: 4 | 5 | ### pyspark 6 | 7 | - [pyspark's shell basic example](pyspark/pyspark-shell-basic.md) 8 | 9 | - [pyspark's shell UAST extraction](pyspark/pyspark-shell-uast-extraction.md) 10 | 11 | - [pyspark's shell classifying languages](pyspark/pyspark-shell-classifying-languages.md) 12 | 13 | - [pyspark's shell data schemas](pyspark/pyspark-shell-schemas.md) 14 | 15 | - [pyspark's shell classifying languages and extracting UASTs](pyspark/pyspark-shell-lang-and-uast.md) 16 | 17 | - [pyspark's shell querying UASTs with XPath](pyspark/pyspark-shell-xpath-query.md) 18 | 19 | - [pyspark's shell raw repositories](pyspark/pyspark-shell-raw-repositories.md) 20 | 21 | ### scala 22 | 23 | - [spark-shell basic example](scala/spark-shell-basic.md) 24 | 25 | - [spark-shell UAST extraction](scala/spark-shell-uast-extraction.md) 26 | 27 | - [spark-shell classifying languages](scala/spark-shell-classifying-languages.md) 28 | 29 | - [spark-shell data schemas](scala/spark-shell-schemas.md) 30 | 31 | - [spark-shell classifying languages and extracting UASTs](scala/spark-shell-lang-and-uast.md) 32 | 33 | - [spark-shell querying UASTs with XPath](scala/spark-shell-xpath-query.md) 34 | 35 | - [spark-shell raw repositories](scala/spark-shell-raw-repositories.md) 36 | 37 | ### jupyter notebooks 38 | 39 | - [Basic example](notebooks/Example.ipynb) 40 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-basic.md: -------------------------------------------------------------------------------- 1 | ## Basic example 2 | 3 | In this example, the pyspark-shell is used to show a simple usage of the source{d} jgit-spark-connector. 4 | 5 | First, you can see how to import the package and instantiate an object that provides all the methods to manipulate the data retrieved from repositories. 6 | 7 | The `engine` object is used to get all the repositories, get the `HEAD` references from the repositories and, finally, get all the blobs from those references. Then a table is shown selecting the columns `blob_id`, `path` and `content`.
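As the output below shows, the `content` column holds the raw bytes of each blob. If you want to inspect it as text, the following is a minimal follow-up sketch (assuming the same `engine` object created in the example below and UTF-8 encoded sources; `decode_utf8` is just an illustrative helper name):

```python
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Illustrative helper: decode blob bytes to text, replacing undecodable bytes.
decode_utf8 = F.udf(lambda b: b.decode('utf-8', 'replace') if b is not None else None, StringType())

blobs = engine.repositories.references.head_ref.commits.tree_entries.blobs
blobs.where(blobs.is_binary == False) \
    .withColumn('text', decode_utf8('content')) \
    .select('path', 'text') \
    .show()
```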
8 | 9 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code 15 | ```python 16 | from sourced.engine import Engine 17 | engine = Engine(spark, '/path/to/siva-files', 'siva') 18 | engine.repositories.references.head_ref.commits.tree_entries.blobs.select('blob_id', 'path', 'content').show() 19 | 20 | ''' Output: 21 | +--------------------+--------------------+--------------------+ 22 | | blob_id | path| content| 23 | +--------------------+--------------------+--------------------+ 24 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[64 65 66 20 66 6...| 25 | |7268016814b8ab7bc...| gcd/gcd.py|[69 6D 70 6F 72 7...| 26 | |25dbfff34dcc8d252...| README.md|[23 20 66 75 6E 6...| 27 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 28 | |63bd495dce1d53092...|factorial/factori...|[69 6D 70 6F 72 7...| 29 | |bf17d9730e43f5697...| .travis.yml|[6C 61 6E 67 75 6...| 30 | |a697a655a7bfd6ba1...| prime/is_prime.py|[64 65 66 20 69 7...| 31 | |76052f368f4c9c8de...|pythagorean_tripl...|[66 72 6F 6D 20 7...| 32 | |3be2253ba2e871d3b...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 33 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 34 | |7268016814b8ab7bc...| gcd/gcd.py|[69 6D 70 6F 72 7...| 35 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[69 6D 70 6F 72 7...| 36 | |4d3617f27e277e4b5...|differentiation/s...|[66 72 6F 6D 20 7...| 37 | |4d3617f27e277e4b5...|differentiation/s...|[66 72 6F 6D 20 7...| 38 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[64 65 66 20 67 6...| 39 | |8ab978a56c5dcb239...|factorial/factori...|[64 65 66 20 66 6...| 40 | |e35a52f431feac4b7...| abs/abs.py|[69 6D 70 6F 72 7...| 41 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 42 | |51bdeff4494d60bb7...|euclidean/distanc...|[69 6D 70 6F 72 7...| 43 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[64 65 66 20 67 6...| 44 | +--------------------+--------------------+--------------------+ 45 | only showing top 20 rows 46 | ''' 47 | ``` 48 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-classifying-languages.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages example 2 | 3 | This example uses the pyspark-shell to show how to classify blobs by their language with `classify_languages()`. 4 | 5 | Making use of the `engine` object, it retrieves repositories to get all blobs from the `HEAD` references from them. After that, a call to `classify_languages()` function detects the language for each file to show them in the aggregated column `lang` beside the selected columns `blob_id` and `path`. 
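Beyond listing the language of every file, as the example below does, a common follow-up is to aggregate the classification. The snippet here is a small sketch (assuming the same `engine` object built in the example below) that counts blobs per detected language with plain DataFrame operations:

```python
blobs = engine.repositories.references.head_ref.commits.tree_entries.blobs

# Count how many blobs were classified into each language.
blobs.classify_languages() \
    .groupBy('lang') \
    .count() \
    .orderBy('count', ascending=False) \
    .show()
```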
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().select("blob_id", "path", "lang").show() 17 | 18 | ''' Output: 19 | +--------------------+--------------------+--------+ 20 | | blob_id| path| lang| 21 | +--------------------+--------------------+--------+ 22 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python| 23 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 24 | |25dbfff34dcc8d252...| README.md|Markdown| 25 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 26 | |63bd495dce1d53092...|factorial/factori...| Python| 27 | |bf17d9730e43f5697...| .travis.yml| YAML| 28 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python| 29 | |76052f368f4c9c8de...|pythagorean_tripl...| Python| 30 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python| 31 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python| 32 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 33 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python| 34 | |4d3617f27e277e4b5...|differentiation/s...| Python| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python| 36 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 37 | |8ab978a56c5dcb239...|factorial/factori...| Python| 38 | |e35a52f431feac4b7...| abs/abs.py| Python| 39 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 40 | |51bdeff4494d60bb7...|euclidean/distanc...| Python| 41 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 42 | +--------------------+--------------------+--------+ 43 | only showing top 20 rows 44 | ''' 45 | ``` 46 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-lang-and-uast.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages and extracting UASTs example 2 | 3 | The combined usage of both, `classify_languages()` and `extract_uasts()` methods, has the advantage that doesn't rely the language detection task on the [bblfsh server](https://github.com/bblfsh/server) , so you can save some time. 4 | 5 | To do that, you just have to call `extract_uasts()` on a Dataframe where previously, `classify_languages()` was used. 
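As the output below shows, blobs for which no UAST could be extracted (for instance the Markdown and YAML files here) come back with an empty `uast` column. The snippet here is a small sketch (assuming the same `engine` object created in the example below) that keeps only the rows with at least one extracted UAST:

```python
from pyspark.sql import functions as F

parsed = engine.repositories.references.head_ref.commits.tree_entries.blobs \
    .classify_languages() \
    .extract_uasts()

# Keep only blobs with a non-empty UAST array.
parsed.where(F.size('uast') > 0).select('path', 'lang').show()
```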
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().extract_uasts().select("path", "lang", "uast").show() 17 | 18 | ''' Output: 19 | +--------------------+--------+-------------+ 20 | | path| lang| uast| 21 | +--------------------+--------+-------------+ 22 | |fibonacci/fibonac...| Python|[[B@759dfd4e]| 23 | | gcd/gcd.py| Python| [[B@36ea40c]| 24 | | README.md|Markdown| []| 25 | |prime/is_prime_op...| Python|[[B@2da632d5]| 26 | |factorial/factori...| Python| [[B@37e738]| 27 | | .travis.yml| YAML| []| 28 | | prime/is_prime.py| Python|[[B@1ada1dfd]| 29 | |pythagorean_tripl...| Python|[[B@6ce2846e]| 30 | |prime/is_prime_op...| Python|[[B@704e33bd]| 31 | |prime/is_prime_op...| Python|[[B@4fff14ab]| 32 | | gcd/gcd.py| Python| [[B@580cd5c]| 33 | |gcd/gcd_optimal_e...| Python|[[B@7db9e876]| 34 | |differentiation/s...| Python|[[B@7c6befa7]| 35 | |differentiation/s...| Python|[[B@4b06f6cd]| 36 | | gcd/gcd.py| Python|[[B@486f38dc]| 37 | |factorial/factori...| Python|[[B@7a2783ff]| 38 | | abs/abs.py| Python|[[B@59124dcb]| 39 | |prime/is_prime_op...| Python|[[B@25de68ba]| 40 | |euclidean/distanc...| Python|[[B@14c61d05]| 41 | | gcd/gcd.py| Python|[[B@52b84c19]| 42 | +--------------------+--------+-------------+ 43 | only showing top 20 rows 44 | ''' 45 | ``` 46 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-raw-repositories.md: -------------------------------------------------------------------------------- 1 | # Raw repositories usage 2 | 3 | In this example, the pyspark-shell is used to show the usage of source{d} jgit-spark-connector with raw git repositories. 4 | 5 | ## Differences with siva usage 6 | 7 | What are the main differences between using the jgit-spark-connector with siva files and raw git repositories? 8 | 9 | * Raw repositories can have non-remote references, siva files do not. 10 | * Even if you have only one repository, you may have N repositories in the output returned by the jgit-spark-connector. That's because different origins are treated as different repositories. In short, you'll have as many repositories as remotes in your repository plus one repository that corresponds to the local repository, which is identified by `file://$PATH_TO_REPOSITORY`. This one will always contain non-remote references and the rest of the repositories will always contain remote references. 11 | 12 | **Note:** raw repositories refer to `standard` and `bare` repositories. 
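Since the local repository is the one whose `id` starts with `file://`, you can tell it apart from the remote-derived entries directly on the `repositories` DataFrame. The snippet below is a sketch under that assumption, with the `engine` object built as shown in the next section:

```python
repos = engine.repositories

# The local repository is identified by a file:// id; the others come from remotes.
local_repos = repos.filter(repos.id.startswith('file://'))
remote_repos = repos.filter(~repos.id.startswith('file://'))

local_repos.select('id').show(truncate=False)
remote_repos.select('id').show(truncate=False)
```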
13 | 14 | ## Getting repository references 15 | 16 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 17 | ```sh 18 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 19 | ``` 20 | 21 | We can get the repositories like this, and see that even though there is only one repository on disk, the jgit-spark-connector reports two: 22 | 23 | ```python 24 | from sourced.engine import Engine 25 | engine = Engine(spark, '/path/to/repositories', 'standard') 26 | print(engine.repositories.count()) 27 | 28 | '''Output: 29 | 2 30 | ''' 31 | ``` 32 | 33 | Getting references: 34 | 35 | ```python 36 | print(engine.repositories.references.count()) 37 | 38 | '''Output: 39 | 4 40 | ''' 41 | ``` 42 | 43 | If you want behavior closer to the siva files usage, you can filter out non-remote references: 44 | 45 | ```python 46 | references = engine.repositories.references 47 | print(references.filter(references.is_remote == True).count()) 48 | 49 | '''Output: 50 | 2 51 | ''' 52 | ``` 53 | 54 | Alternatively, you can use the following shorthand: 55 | 56 | ```python 57 | print(engine.repositories.remote_references.count()) 58 | 59 | '''Output: 60 | 2 61 | ''' 62 | ``` 63 | 64 | ### Caveats 65 | 66 | Note that even if your repository has a reference named `refs/remotes/origin/master`, it will be converted to a reference named `refs/heads/master` that belongs to the repository identified by your origin remote URL. 67 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-schemas.md: -------------------------------------------------------------------------------- 1 | ## Printing schema example 2 | 3 | The next example shows a simple usage of the useful `printSchema()` method. 4 | 5 | It helps you keep track of the columns that your transformations add to or prune from the data you are handling.
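If you prefer to compare schemas programmatically instead of reading the printed trees below, a short sketch (again assuming the `engine` object created in the example below) can diff the column sets of two steps:

```python
blobs = engine.repositories.references.commits.tree_entries.blobs

# Which columns does classify_languages() add on top of the blobs DataFrame?
before = set(blobs.columns)
after = set(blobs.classify_languages().columns)
print(after - before)  # expected to show only the added 'lang' column
```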
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | 17 | engine.repositories.printSchema() 18 | ''' Output: 19 | root 20 | |-- id: string (nullable = false) 21 | |-- urls: array (nullable = false) 22 | | |-- element: string (containsNull = false) 23 | |-- is_fork: boolean (nullable = true) 24 | ''' 25 | 26 | engine.repositories.references.printSchema() 27 | ''' Output: 28 | root 29 | |-- repository_id: string (nullable = false) 30 | |-- name: string (nullable = false) 31 | |-- hash: string (nullable = false) 32 | |-- is_remote: boolean (nullable = false) 33 | ''' 34 | 35 | engine.repositories.references.commits.printSchema() 36 | ''' 37 | Also: engine.repositories.references.all_reference_commits.printSchema() 38 | ''' 39 | ''' Output: 40 | root 41 | |-- repository_id: string (nullable = false) 42 | |-- reference_name: string (nullable = false) 43 | |-- index: integer (nullable = false) 44 | |-- hash: string (nullable = false) 45 | |-- message: string (nullable = false) 46 | |-- parents: array (nullable = true) 47 | | |-- element: string (containsNull = false) 48 | |-- parents_count: integer (nullable = false) 49 | |-- author_email: string (nullable = true) 50 | |-- author_name: string (nullable = true) 51 | |-- author_date: timestamp (nullable = true) 52 | |-- committer_email: string (nullable = true) 53 | |-- committer_name: string (nullable = true) 54 | |-- committer_date: timestamp (nullable = true) 55 | ''' 56 | 57 | engine.repositories.references.commits.tree_entries.printSchema() 58 | ''' Output: 59 | root 60 | |-- commit_hash: string (nullable = false) 61 | |-- repository_id: string (nullable = false) 62 | |-- reference_name: string (nullable = false) 63 | |-- path: string (nullable = true) 64 | |-- blob: string (nullable = false) 65 | ''' 66 | 67 | engine.repositories.references.commits.tree_entries.blobs.printSchema() 68 | ''' Output: 69 | root 70 | |-- blob_id: string (nullable = false) 71 | |-- commit_hash: string (nullable = false) 72 | |-- repository_id: string (nullable = false) 73 | |-- reference_name: string (nullable = false) 74 | |-- content: binary (nullable = true) 75 | |-- is_binary: boolean (nullable = false) 76 | |-- path: string (nullable = true) 77 | ''' 78 | 79 | engine.repositories.references.commits.tree_entries.blobs.classify_languages().printSchema() 80 | ''' Output: 81 | root 82 | |-- blob_id: string (nullable = false) 83 | |-- commit_hash: string (nullable = false) 84 | |-- repository_id: string (nullable = false) 85 | |-- reference_name: string (nullable = false) 86 | |-- content: binary (nullable = true) 87 | |-- is_binary: boolean (nullable = false) 88 | |-- path: string (nullable = true) 89 | |-- lang: string (nullable = true) 90 | ''' 91 | 92 | engine.repositories.references.commits.tree_entries.blobs.classify_languages().extract_uasts().printSchema() 93 | ''' Output: 94 | root 95 | |-- blob_id: string (nullable = false) 96 | |-- commit_hash: string (nullable = false) 97 | |-- repository_id: string (nullable = false) 98 | |-- reference_name: string (nullable = false) 99 | |-- content: binary (nullable = true) 100 | |-- is_binary: boolean (nullable = false) 101 | |-- path: string (nullable = true) 102 | |-- lang: string (nullable 
= true) 103 | |-- uast: array (nullable = true) 104 | | |-- element: binary (containsNull = true) 105 | ''' 106 | ``` 107 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-uast-extraction.md: -------------------------------------------------------------------------------- 1 | ## Extracting UASTs example 2 | 3 | In the example code below, you can take a look to how the `extract_uasts()` method works. 4 | 5 | From the `engine` object instantiated in the spark-shell, a bunch of blobs are retrieving from the `HEAD` references from all the repositories and requesting for them. Once we have those blobs, we can call `extract_uasts()` which send the blobs to a [bblfsh server](https://github.com/bblfsh/server) to get back the UASTs. 6 | 7 | Finally, the `blob_id` , `path` and `uast` is showed on the table. 8 | 9 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code: 15 | ```python 16 | from sourced.engine import Engine 17 | engine = Engine(spark, '/path/to/siva-files', 'siva') 18 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().extract_uasts().select("blob_id", "path", "uast").show() 19 | 20 | ''' Output: 21 | +--------------------+--------------------+-------------+ 22 | | blob_id| path| uast| 23 | +--------------------+--------------------+-------------+ 24 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[[B@43efe672]| 25 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@66938491]| 26 | |25dbfff34dcc8d252...| README.md| []| 27 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@51261a61]| 28 | |63bd495dce1d53092...|factorial/factori...|[[B@3163c734]| 29 | |bf17d9730e43f5697...| .travis.yml| []| 30 | |a697a655a7bfd6ba1...| prime/is_prime.py| [[B@d036b1c]| 31 | |76052f368f4c9c8de...|pythagorean_tripl...|[[B@774ec121]| 32 | |3be2253ba2e871d3b...|prime/is_prime_op...|[[B@16da28bb]| 33 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[[B@39af1733]| 34 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@2f62c091]| 35 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[[B@2e245b95]| 36 | |4d3617f27e277e4b5...|differentiation/s...|[[B@697c211a]| 37 | |4d3617f27e277e4b5...|differentiation/s...|[[B@282bb589]| 38 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@11f49e55]| 39 | |8ab978a56c5dcb239...|factorial/factori...|[[B@1d80870d]| 40 | |e35a52f431feac4b7...| abs/abs.py|[[B@157c0156]| 41 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@608e698d]| 42 | |51bdeff4494d60bb7...|euclidean/distanc...|[[B@55bd45ff]| 43 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@4c1c08aa]| 44 | +--------------------+--------------------+-------------+ 45 | only showing top 20 rows 46 | ''' 47 | ``` 48 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-xpath-query.md: -------------------------------------------------------------------------------- 1 | ## Querying UASTs with XPath example 2 | 3 | You can see in this example how to make queries using [XPath syntax](https://www.w3.org/TR/xpath/) to retrieve valuable information from the UASTs. 4 | 5 | First we must use `extract_uasts()` method to request to a [bblfsh daemon](https://github.com/bblfsh/bblfshd) the UASTs. 6 | 7 | Then we can use the method `query_uast()` to get a result for the query we are formulating requesting tokens. 
This method takes in three parameters, the query, the column which contains the UASTs and the column that will be generated with the result. 8 | 9 | Finally, `extract_tokens()` method will generate a column `tokens` based on the previous generated column `result`. 10 | 11 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 12 | ```sh 13 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 14 | ``` 15 | 16 | Code: 17 | ```python 18 | from sourced.engine import Engine 19 | engine = Engine(spark, '/path/to/siva-files', 'siva') 20 | 21 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().where('lang = "Python"').extract_uasts().query_uast('//*[@roleIdentifier]').extract_tokens('result', 'tokens').select('blob_id', 'path', 'lang', 'uast', 'tokens').show() 22 | 23 | ''' Output: 24 | +--------------------+--------------------+------+-------------+--------------------+ 25 | | blob_id| path| lang| uast| tokens| 26 | +--------------------+--------------------+------+-------------+--------------------+ 27 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|Python|[[B@617b4738]|[fibonacci, n, in...| 28 | |7268016814b8ab7bc...| gcd/gcd.py|Python|[[B@2c66d0f9]|[math, gcd, a, in...| 29 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|Python|[[B@59c072af]|[math, is_prime, ...| 30 | |63bd495dce1d53092...|factorial/factori...|Python|[[B@45b32617]|[math, factorial,...| 31 | |a697a655a7bfd6ba1...| prime/is_prime.py|Python|[[B@7ecafb1e]|[is_prime, n, int...| 32 | |76052f368f4c9c8de...|pythagorean_tripl...|Python|[[B@64311d26]|[typing, List, ty...| 33 | |3be2253ba2e871d3b...|prime/is_prime_op...|Python|[[B@3e3e5e05]|[math, random, RA...| 34 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|Python|[[B@62e1544b]|[math, is_prime_o...| 35 | |7268016814b8ab7bc...| gcd/gcd.py|Python|[[B@4b5a5102]|[math, gcd, a, in...| 36 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|Python|[[B@27eead62]|[math, gcd_optima...| 37 | |4d3617f27e277e4b5...|differentiation/s...|Python|[[B@6b6c11ec]|[typing, Callable...| 38 | |4d3617f27e277e4b5...|differentiation/s...|Python| [[B@3c753c6]|[typing, Callable...| 39 | |6d7c6cb29abb52fc2...| gcd/gcd.py|Python|[[B@1a8cd0fd]|[gcd, a, int, b, ...| 40 | |8ab978a56c5dcb239...|factorial/factori...|Python|[[B@485beb73]|[factorial, n, in...| 41 | |e35a52f431feac4b7...| abs/abs.py|Python|[[B@43b370e5]|[math, abs, x, re...| 42 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|Python|[[B@7a534236]|[math, is_prime, ...| 43 | |51bdeff4494d60bb7...|euclidean/distanc...|Python| [[B@6246eb9]|[math, typing, Tu...| 44 | |6d7c6cb29abb52fc2...| gcd/gcd.py|Python|[[B@11b30d7d]|[gcd, a, int, b, ...| 45 | |e35a52f431feac4b7...| abs/abs.py|Python|[[B@495f63f6]|[math, abs, x, re...| 46 | |8ab978a56c5dcb239...|factorial/factori...|Python|[[B@297dca19]|[factorial, n, in...| 47 | +--------------------+--------------------+------+-------------+--------------------+ 48 | only showing top 20 rows 49 | ''' 50 | ``` 51 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-basic.md: -------------------------------------------------------------------------------- 1 | ## Basic example 2 | 3 | In this example, the spark-shell is used to show a simple usage of the source{d} jgit-spark-connector. 
4 | 5 | First, you can see how to import the package and instantiate an object that provides all the methods to manipulate the data retrieved from repositories. 6 | 7 | The `engine` object is used to filter repositories by `id`, get the `HEAD` references from the repositories and look for the commits in those references that contain the word `Initial` in their messages. Then a table is shown selecting the columns `repository_id`, `hash` and `message`. 8 | 9 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code: 15 | ```scala 16 | import tech.sourced.engine._ 17 | 18 | val engine = Engine(spark, "/path/to/siva-files", "siva") 19 | engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getReferences.filter('name === "refs/heads/HEAD").getCommits.filter('message.contains("Initial")).select('repository_id, 'hash, 'message).show 20 | 21 | /* Output: 22 | +--------------------+--------------------+--------------+ 23 | |       repository_id|                hash|       message| 24 | +--------------------+--------------------+--------------+ 25 | |github.com/mingra...|aac052c42c501abf6...|Initial commit| 26 | +--------------------+--------------------+--------------+ 27 | */ 28 | ``` 29 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-classifying-languages.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages example 2 | 3 | This example uses the spark-shell to show how to classify blobs by their language with `classifyLanguages`. 4 | 5 | Making use of the `engine` object, it filters repositories by `id` to get all blobs from their `HEAD` references. After that, a call to the `classifyLanguages` function detects the language of each file, shown in the added column `lang` beside the selected columns `blob_id` and `path`.
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.select('blob_id, 'path, 'lang).show 18 | 19 | /* Output: 20 | +--------------------+--------------------+--------+ 21 | | blob_id| path| lang| 22 | +--------------------+--------------------+--------+ 23 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python| 24 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 25 | |25dbfff34dcc8d252...| README.md|Markdown| 26 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 27 | |63bd495dce1d53092...|factorial/factori...| Python| 28 | |bf17d9730e43f5697...| .travis.yml| YAML| 29 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python| 30 | |76052f368f4c9c8de...|pythagorean_tripl...| Python| 31 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python| 32 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python| 33 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 34 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python| 36 | |4d3617f27e277e4b5...|differentiation/s...| Python| 37 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 38 | |8ab978a56c5dcb239...|factorial/factori...| Python| 39 | |e35a52f431feac4b7...| abs/abs.py| Python| 40 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 41 | |51bdeff4494d60bb7...|euclidean/distanc...| Python| 42 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 43 | +--------------------+--------------------+--------+ 44 | only showing top 20 rows 45 | */ 46 | ``` 47 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-lang-and-uast.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages and extracting UASTs example 2 | 3 | The combined usage of both, `classifyLanguages` and `extractUASTs` methods, has the advantage that doesn't rely the language detection task on the [bblfsh server](https://github.com/bblfsh/server) , so you can save some time. 4 | 5 | To do that, you just have to call `extractUASTs` on a Dataframe where previously, `classifyLanguages` was used. 
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.extractUASTs.select('blob_id, 'path, 'lang, 'uast).show 18 | 19 | /* Output: 20 | +--------------------+--------------------+--------+-------------+ 21 | | blob_id| path| lang| uast| 22 | +--------------------+--------------------+--------+-------------+ 23 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python|[[B@62f37a44]| 24 | |7268016814b8ab7bc...| gcd/gcd.py| Python|[[B@7c0368da]| 25 | |25dbfff34dcc8d252...| README.md|Markdown| []| 26 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python|[[B@7fa8bfe4]| 27 | |63bd495dce1d53092...|factorial/factori...| Python|[[B@3cad2dd4]| 28 | |bf17d9730e43f5697...| .travis.yml| YAML| []| 29 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python|[[B@45f5415f]| 30 | |76052f368f4c9c8de...|pythagorean_tripl...| Python|[[B@22d7a483]| 31 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python|[[B@18ba78a2]| 32 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python|[[B@4dac25ec]| 33 | |7268016814b8ab7bc...| gcd/gcd.py| Python|[[B@223c6abf]| 34 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python|[[B@3dd021c7]| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python|[[B@76e431b7]| 36 | |4d3617f27e277e4b5...|differentiation/s...| Python|[[B@5a4bf9c2]| 37 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python|[[B@1be309a6]| 38 | |8ab978a56c5dcb239...|factorial/factori...| Python|[[B@2781dd04]| 39 | |e35a52f431feac4b7...| abs/abs.py| Python|[[B@70bf39ca]| 40 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python|[[B@753f5bf6]| 41 | |51bdeff4494d60bb7...|euclidean/distanc...| Python|[[B@7612c2ce]| 42 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python|[[B@5f5248f5]| 43 | +--------------------+--------------------+--------+-------------+ 44 | only showing top 20 rows 45 | */ 46 | ``` 47 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-raw-repositories.md: -------------------------------------------------------------------------------- 1 | # Raw repositories usage 2 | 3 | In this example, the spark-shell is used to show the usage of source{d} jgit-spark-connector with raw git repositories. 4 | 5 | ## Differences with siva usage 6 | 7 | What are the main differences between using the jgit-spark-connector with siva files and raw git repositories? 8 | 9 | * Raw repositories can have non-remote references, siva files do not. 10 | * Even if you have only one repository, you may have N repositories in the output returned by the jgit-spark-connector. That's because different origins are treated as different repositories. In short, you'll have as many repositories as remotes in your repository plus one repository that corresponds to the local repository, which is identified by `file://$PATH_TO_REPOSITORY`. This one will always contain non-remote references and the rest of the repositories will always contain remote references. 11 | 12 | **Note:** raw repositories refer to `standard` and `bare` repositories. 
13 | 14 | ## Getting repository references 15 | 16 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 17 | ```sh 18 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 19 | ``` 20 | 21 | We can get the repositories like this, and see that even though there is only one repository on disk, the jgit-spark-connector reports two: 22 | 23 | ```scala 24 | import tech.sourced.engine._ 25 | val engine = Engine(spark, "/path/to/repositories", "standard") 26 | println(engine.getRepositories.count()) 27 | 28 | // Output: 29 | // 2 30 | ``` 31 | 32 | Getting references: 33 | 34 | ```scala 35 | println(engine.getRepositories.getReferences.count()) 36 | 37 | // Output: 38 | // 4 39 | ``` 40 | 41 | If you want behavior closer to the siva files usage, you can filter out non-remote references: 42 | 43 | ```scala 44 | val references = engine.getRepositories.getReferences 45 | println(references.filter(references("is_remote") === true).count()) 46 | 47 | // Output: 48 | // 2 49 | ``` 50 | 51 | Alternatively, you can use the following shorthand: 52 | 53 | ```scala 54 | println(engine.getRepositories.getRemoteReferences.count()) 55 | 56 | // Output: 57 | // 2 58 | ``` 59 | 60 | ### Caveats 61 | 62 | Note that even if your repository has a reference named `refs/remotes/origin/master`, it will be converted to a reference named `refs/heads/master` that belongs to the repository identified by your origin remote URL. 63 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-schemas.md: -------------------------------------------------------------------------------- 1 | ## Printing schema example 2 | 3 | The next example shows a simple usage of the useful `printSchema` method. 4 | 5 | It helps you keep track of the columns that your transformations add to or prune from the data you are handling.
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.printSchema 18 | /* Output: 19 | root 20 | |-- id: string (nullable = false) 21 | |-- urls: array (nullable = false) 22 | | |-- element: string (containsNull = false) 23 | |-- is_fork: boolean (nullable = true) 24 | */ 25 | 26 | engine.getRepositories.getReferences.printSchema 27 | /* Output: 28 | root 29 | |-- repository_id: string (nullable = false) 30 | |-- name: string (nullable = false) 31 | |-- hash: string (nullable = false) 32 | |-- is_remote: boolean (nullable = false) 33 | */ 34 | 35 | engine.getRepositories.getReferences.getCommits.printSchema 36 | /* also engine.getRepositories.getReferences.getAllReferenceCommits.printSchema */ 37 | /* Output: 38 | root 39 | |-- repository_id: string (nullable = false) 40 | |-- reference_name: string (nullable = false) 41 | |-- index: integer (nullable = false) 42 | |-- hash: string (nullable = false) 43 | |-- message: string (nullable = false) 44 | |-- parents: array (nullable = true) 45 | | |-- element: string (containsNull = false) 46 | |-- parents_count: integer (nullable = false) 47 | |-- author_email: string (nullable = true) 48 | |-- author_name: string (nullable = true) 49 | |-- author_date: timestamp (nullable = true) 50 | |-- committer_email: string (nullable = true) 51 | |-- committer_name: string (nullable = true) 52 | |-- committer_date: timestamp (nullable = true) 53 | */ 54 | 55 | engine.getRepositories.getReferences.getCommits.getTreeEntries.printSchema 56 | /* Output: 57 | root 58 | |-- commit_hash: string (nullable = false) 59 | |-- repository_id: string (nullable = false) 60 | |-- reference_name: string (nullable = false) 61 | |-- blob: string (nullable = true) 62 | */ 63 | 64 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.printSchema 65 | /* Output: 66 | root 67 | |-- blob_id: string (nullable = false) 68 | |-- commit_hash: string (nullable = false) 69 | |-- repository_id: string (nullable = false) 70 | |-- reference_name: string (nullable = false) 71 | |-- content: binary (nullable = true) 72 | |-- is_binary: boolean (nullable = false) 73 | |-- path: string (nullable = true) 74 | */ 75 | 76 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.classifyLanguages.printSchema 77 | /* Output: 78 | root 79 | |-- blob_id: string (nullable = false) 80 | |-- commit_hash: string (nullable = false) 81 | |-- repository_id: string (nullable = false) 82 | |-- reference_name: string (nullable = false) 83 | |-- content: binary (nullable = true) 84 | |-- is_binary: boolean (nullable = false) 85 | |-- path: string (nullable = true) 86 | |-- lang: string (nullable = true) 87 | */ 88 | 89 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.classifyLanguages.extractUASTs.printSchema 90 | /* Output: 91 | root 92 | |-- blob_id: string (nullable = false) 93 | |-- commit_hash: string (nullable = false) 94 | |-- repository_id: string (nullable = false) 95 | |-- reference_name: string (nullable = false) 96 | |-- content: binary (nullable = true) 97 | |-- is_binary: boolean (nullable = false) 98 | |-- path: string (nullable = true) 99 | |-- lang: string (nullable = true) 100 | |-- 
uast: array (nullable = true) 101 | | |-- element: binary (containsNull = true) 102 | */ 103 | ``` 104 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-uast-extraction.md: -------------------------------------------------------------------------------- 1 | ## Extracting UASTs example 2 | 3 | The example code below shows how the `extractUASTs` method works. 4 | 5 | From the `engine` object instantiated in the spark-shell, a set of blobs is obtained by filtering repositories by `id`, retrieving their `HEAD` references and requesting the blobs for them. Once we have those blobs, we can call `extractUASTs`, which sends the blobs to a [bblfsh server](https://github.com/bblfsh/server) and gets back the UASTs. 6 | 7 | Finally, the `blob_id`, file `path` and `uast` columns are shown in the table. 8 | 9 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | ```scala 15 | import tech.sourced.engine._ 16 | 17 | val engine = Engine(spark, "/path/to/siva-files", "siva") 18 | val exampleDf = engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getHEAD.getCommits.getTreeEntries.getBlobs.extractUASTs.select('blob_id, 'path, 'uast) 19 | 20 | exampleDf.show 21 | 22 | /* Output: 23 | +--------------------+--------------------+-------------+ 24 | | blob_id| path| uast| 25 | +--------------------+--------------------+-------------+ 26 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[[B@5e53daf6]| 27 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@65f08242]| 28 | |25dbfff34dcc8d252...| README.md| []| 29 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@7d81ce6a]| 30 | |63bd495dce1d53092...|factorial/factori...|[[B@4c903df9]| 31 | |bf17d9730e43f5697...| .travis.yml| []| 32 | |a697a655a7bfd6ba1...| prime/is_prime.py| [[B@cd4caf7]| 33 | |76052f368f4c9c8de...|pythagorean_tripl...|[[B@6d57bbbd]| 34 | |3be2253ba2e871d3b...|prime/is_prime_op...|[[B@1ed6dae3]| 35 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[[B@53e45335]| 36 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@79cda8cc]| 37 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[[B@29976e1b]| 38 | |4d3617f27e277e4b5...|differentiation/s...| [[B@13ea808]| 39 | |4d3617f27e277e4b5...|differentiation/s...|[[B@70323ee1]| 40 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@642d63e3]| 41 | |8ab978a56c5dcb239...|factorial/factori...|[[B@76583ecb]| 42 | |e35a52f431feac4b7...| abs/abs.py| [[B@252b6e0]| 43 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@63f6557d]| 44 | |51bdeff4494d60bb7...|euclidean/distanc...|[[B@6ccb009b]| 45 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@5b52d5af]| 46 | +--------------------+--------------------+-------------+ 47 | only showing top 20 rows 48 | */ 49 | ``` 50 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-xpath-query.md: -------------------------------------------------------------------------------- 1 | ## Querying UASTs with XPath example 2 | 3 | This example shows how to run queries using [XPath syntax](https://www.w3.org/TR/xpath/) to retrieve valuable information from the UASTs. 4 | 5 | First, we use the `extractUASTs` method to request the UASTs from a [bblfsh daemon](https://github.com/bblfsh/bblfshd). 6 | 7 | Then we can use the `queryUAST` method to run a query that requests tokens.
This method takes in three parameters, the query, the column which contains the UASTs and the column that will be generated with the result. 8 | 9 | Finally, `extractTokens` method will generate a column `tokens` based on the previous generated column `result`. 10 | 11 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 12 | ```sh 13 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 14 | ``` 15 | 16 | Code: 17 | ```scala 18 | import tech.sourced.engine._ 19 | 20 | val engine = Engine(spark, "/path/to/siva-files", "siva") 21 | engine.getRepositories.getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.where('lang === "Python").extractUASTs.queryUAST("//*[@roleIdentifier]", "uast", "result").extractTokens("result", "tokens").select('path, 'lang, 'uast, 'tokens).show 22 | 23 | /* Output: 24 | +--------------------+------+-------------+--------------------+ 25 | | path| lang| uast| tokens| 26 | +--------------------+------+-------------+--------------------+ 27 | |fibonacci/fibonac...|Python|[[B@466c4700]|[fibonacci, n, in...| 28 | | gcd/gcd.py|Python|[[B@22a4508c]|[math, gcd, a, in...| 29 | |prime/is_prime_op...|Python|[[B@6772d8f3]|[math, is_prime, ...| 30 | |factorial/factori...|Python| [[B@86bff75]|[math, factorial,...| 31 | | prime/is_prime.py|Python|[[B@2c1bed3f]|[is_prime, n, int...| 32 | |pythagorean_tripl...|Python|[[B@2cbbf800]|[typing, List, ty...| 33 | |prime/is_prime_op...|Python|[[B@5d7f1824]|[math, random, RA...| 34 | |prime/is_prime_op...|Python| [[B@ab8c4a9]|[math, is_prime_o...| 35 | | gcd/gcd.py|Python|[[B@7939b2d4]|[math, gcd, a, in...| 36 | |gcd/gcd_optimal_e...|Python| [[B@a313e0b]|[math, gcd_optima...| 37 | |differentiation/s...|Python|[[B@2faab951]|[typing, Callable...| 38 | |differentiation/s...|Python|[[B@637bad81]|[typing, Callable...| 39 | | gcd/gcd.py|Python|[[B@57601c28]|[gcd, a, int, b, ...| 40 | |factorial/factori...|Python|[[B@5422a1a9]|[factorial, n, in...| 41 | | abs/abs.py|Python|[[B@2e38fa4d]|[math, abs, x, re...| 42 | |prime/is_prime_op...|Python|[[B@10914dae]|[math, is_prime, ...| 43 | |euclidean/distanc...|Python|[[B@47c782c8]|[math, typing, Tu...| 44 | | gcd/gcd.py|Python| [[B@6a94c70]|[gcd, a, int, b, ...| 45 | | abs/abs.py|Python|[[B@6faa347a]|[math, abs, x, re...| 46 | |factorial/factori...|Python|[[B@754ce81c]|[factorial, n, in...| 47 | +--------------------+------+-------------+--------------------+ 48 | only showing top 20 rows 49 | */ 50 | ``` 51 | -------------------------------------------------------------------------------- /_examples/siva-files/2d58138f24fa863c235b0c33158b870a40c79ee2.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/2d58138f24fa863c235b0c33158b870a40c79ee2.siva -------------------------------------------------------------------------------- /_examples/siva-files/5d4a8bf30c0da7209f651632b62a362620556c85.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/5d4a8bf30c0da7209f651632b62a362620556c85.siva -------------------------------------------------------------------------------- /_examples/siva-files/aac052c42c501abf6aa8c3509424e837bb27e188.siva: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/aac052c42c501abf6aa8c3509424e837bb27e188.siva -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import java.nio.file.{Files, StandardCopyOption} 2 | 3 | import Dependencies.{scalaTest, _} 4 | import sbt.Keys.libraryDependencies 5 | 6 | organization := "tech.sourced" 7 | scalaVersion := "2.11.11" 8 | name := "jgit-spark-connector" 9 | 10 | git.useGitDescribe := true 11 | enablePlugins(GitVersioning) 12 | 13 | libraryDependencies += scalaTest % Test 14 | libraryDependencies += scoverage % Test 15 | libraryDependencies += sparkSql % Provided 16 | libraryDependencies += newerHadoopClient % Provided //due to newer v. of guava in bblfsh 17 | // grpc for bblfsh/client-scala needs to be newer than in Spark 18 | libraryDependencies += fixNettyForGrpc 19 | libraryDependencies += jgit % Compile 20 | libraryDependencies += siva % Compile 21 | libraryDependencies += bblfsh % Compile 22 | libraryDependencies += commonsIO % Compile 23 | libraryDependencies += commonsPool % Compile 24 | libraryDependencies += enry % Compile 25 | libraryDependencies += scalaLib % Provided 26 | libraryDependencies += sqlite % Compile 27 | libraryDependencies += sqlite % Test 28 | libraryDependencies += metrics % Compile 29 | 30 | testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, "-oUT") 31 | 32 | test in assembly := {} 33 | assemblyJarName in assembly := s"${name.value}-${version.value}.jar" 34 | 35 | parallelExecution in Test := false 36 | logBuffered in Test := false 37 | 38 | // Shade everything but tech.sourced.engine so the user does not have conflicts 39 | assemblyShadeRules in assembly := Seq( 40 | ShadeRule.rename("com.google.common.**" -> 41 | "tech.sourced.engine.shaded.com.google.common.@1").inAll, 42 | ShadeRule.rename("com.google.protobuf.**" -> 43 | "tech.sourced.engine.shaded.com.google.protobuf.@1").inAll, 44 | ShadeRule.rename("io.netty.**" -> 45 | "tech.sourced.engine.shaded.io.netty.@1").inAll 46 | ) 47 | 48 | assemblyMergeStrategy in assembly := { 49 | case "META-INF/io.netty.versions.properties" => MergeStrategy.last 50 | case x => 51 | val oldStrategy = (assemblyMergeStrategy in assembly).value 52 | oldStrategy(x) 53 | } 54 | 55 | sonatypeProfileName := "tech.sourced" 56 | 57 | // pom settings for sonatype 58 | homepage := Some(url("https://github.com/src-d/jgit-spark-connector")) 59 | scmInfo := Some(ScmInfo(url("https://github.com/src-d/jgit-spark-connector"), 60 | "git@github.com:src-d/jgit-spark-connector.git")) 61 | developers += Developer("ajnavarro", 62 | "Antonio Navarro", 63 | "antonio@sourced.tech", 64 | url("https://github.com/ajnavarro")) 65 | developers += Developer("bzz", 66 | "Alexander Bezzubov", 67 | "alex@sourced.tech", 68 | url("https://github.com/bzz")) 69 | developers += Developer("mcarmonaa", 70 | "Manuel Carmona", 71 | "manuel@sourced.tech", 72 | url("https://github.com/mcarmonaa")) 73 | developers += Developer("erizocosmico", 74 | "Miguel Molina", 75 | "miguel@sourced.tech", 76 | url("https://github.com/erizocosmico")) 77 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) 78 | pomIncludeRepository := (_ => false) 79 | 80 | crossPaths := false 81 | publishMavenStyle := true 82 | 83 | val SONATYPE_USERNAME = 
scala.util.Properties.envOrElse("SONATYPE_USERNAME", "NOT_SET") 84 | val SONATYPE_PASSWORD = scala.util.Properties.envOrElse("SONATYPE_PASSWORD", "NOT_SET") 85 | credentials += Credentials( 86 | "Sonatype Nexus Repository Manager", 87 | "oss.sonatype.org", 88 | SONATYPE_USERNAME, 89 | SONATYPE_PASSWORD) 90 | 91 | val SONATYPE_PASSPHRASE = scala.util.Properties.envOrElse("SONATYPE_PASSPHRASE", "not set") 92 | 93 | useGpg := false 94 | pgpSecretRing := baseDirectory.value / "project" / ".gnupg" / "secring.gpg" 95 | pgpPublicRing := baseDirectory.value / "project" / ".gnupg" / "pubring.gpg" 96 | pgpPassphrase := Some(SONATYPE_PASSPHRASE.toArray) 97 | 98 | packageBin in Compile := { 99 | val file = (packageBin in Compile).value 100 | val dest = new java.io.File(file.getParent, s"${name.value}-${version.value}-slim.jar") 101 | Files.copy( 102 | new java.io.File(file.getAbsolutePath).toPath, 103 | dest.toPath, 104 | StandardCopyOption.REPLACE_EXISTING 105 | ) 106 | Files.delete(file.toPath) 107 | dest 108 | } 109 | 110 | publishArtifact in (Compile, packageBin) := false 111 | 112 | val packageSlim = taskKey[File]("package-slim") 113 | 114 | packageSlim := (packageBin in Compile).value 115 | 116 | addArtifact(Artifact("jgit-spark-connector", "jar", "jar", "slim"), packageSlim) 117 | 118 | assembly := { 119 | val file = assembly.value 120 | val dest = new java.io.File(file.getParent, s"${name.value}-uber.jar") 121 | Files.copy( 122 | new java.io.File(file.getAbsolutePath).toPath, 123 | dest.toPath, 124 | StandardCopyOption.REPLACE_EXISTING 125 | ) 126 | file 127 | } 128 | 129 | assembly := assembly.dependsOn(packageBin in Compile).value 130 | 131 | addArtifact(artifact in(Compile, assembly), assembly) 132 | 133 | isSnapshot := version.value endsWith "SNAPSHOT" 134 | 135 | publishTo := { 136 | val nexus = "https://oss.sonatype.org/" 137 | if (isSnapshot.value) { 138 | Some("snapshots" at nexus + "content/repositories/snapshots") 139 | } else { 140 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-000.md: -------------------------------------------------------------------------------- 1 | ENIP Template 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 0 | 6 | | Title | ENIP Template | 7 | | Author | Antonio Navarro | 8 | | Status | Accepted | 9 | | Created | 2017-05-26 | 10 | | Updated | 2017-09-13 | 11 | | Target version | optional | 12 | 13 | ## Abstract 14 | 15 | A short description of the technical issue being addressed. 16 | 17 | ## Rationale 18 | 19 | Proposal motivation. 20 | 21 | ## Specification 22 | 23 | Technical specification of the changes proposed. 24 | 25 | ## Alternatives 26 | 27 | How the issue is currently solved or can be solved if this change is not accepted. 28 | 29 | ## Impact 30 | 31 | How this change would impact jgit-spark-connector functionality: backward compatibility broken, performance improvements or issues, corner cases and so on. 32 | 33 | ## References 34 | 35 | Links to additional documentation describing related features or other kind of related information. 
36 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-001.md: -------------------------------------------------------------------------------- 1 | # Pre-compute repository metadata and save it to another DataSource 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 1 | 6 | | Title | Pre-compute repository metadata and save it to another DataSource | 7 | | Author | Antonio Navarro | 8 | | Status | Rejected | 9 | | Created | 2017-11-14 | 10 | | Updated | 2017-11-21 | 11 | | Target version | - | 12 | 13 | ## Abstract 14 | 15 | With this change we want to improve the performance of reading repositories metadata, 16 | saving that metadata in other DataSource than GitDataSource. 17 | It can be any of the already implemented ones (json,parquet,jdbc and so on). 18 | 19 | ## Rationale 20 | 21 | Reading the content of siva files over and over again is not really performant. 22 | With this ENIP we want a way to improve speed reading metadata (repositories, references, commits, and tree entries). 23 | 24 | To do that, 25 | we are going to add new methods on the api using the already existing methods on DataFrame API, 26 | [reader][1] and [writer][2]. 27 | 28 | ## Specification 29 | To be able to register other datasource than GitDataSource, we should change a bit the way that we are geting the datasources to process commits, references, or blobs. 30 | 31 | Actually we are registering datasources using `getDatasource` method: 32 | 33 | ```scala 34 | /** 35 | * Returns a [[org.apache.spark.sql.DataFrame]] for the given table using the provided 36 | * [[org.apache.spark.sql.SparkSession]]. 37 | * 38 | * @param table name of the table 39 | * @param session spark session 40 | * @return dataframe for the given table 41 | */ 42 | private[engine] def getDataSource(table: String, session: SparkSession): DataFrame = 43 | session.read.format("tech.sourced.engine.DefaultSource") 44 | .option("table", table) 45 | .load(session.sqlContext.getConf(repositoriesPathKey)) 46 | ``` 47 | 48 | Instead of this, we can create a view using the SparkSession from several datasources: 49 | 50 | ```scala 51 | /** 52 | * Creates a local temporary view using the given name. The lifetime of this 53 | * temporary view is tied to the [[SparkSession]] that was used to create this Dataset. 54 | * 55 | * @group basic 56 | * @since 2.0.0 57 | */ 58 | def createOrReplaceTempView(viewName: String): Unit = withPlan { 59 | createTempViewCommand(viewName, replace = true, global = false) 60 | } 61 | ``` 62 | 63 | `createOrReplaceTempView` method will allow us to register tables at the engine instantiation with several datasources. 64 | Then, from implicit DataFrame methods, we can do: 65 | 66 | ```scala 67 | val commitsDf = df.sparkSession.table("commits") 68 | ``` 69 | 70 | Instead of: 71 | ```scala 72 | val commitsDf = getDataSource("commits", df.sparkSession) 73 | ``` 74 | 75 | Then, the list of needed changes on the Engine API are: 76 | - Initialize GitDataSource views at Engine initialization 77 | - Add method `backMetadataToSource(options)` (name to decide) into the Engine API. 78 | - Add method `fromMetadataSource(options)` (name to decide) into the Engine API. 79 | That method will change all the default registered views to the specified DataSource. 80 | 81 | We should check speed improvement with a substantial amount of repositories and several DataSources. 82 | 83 | ## Alternatives 84 | Using the already existing Spark Dataframe API, 85 | we can save that metadata. 
86 | 87 | Example: 88 | ```scala 89 | repositoriesDf.write.bucketBy(100,"repository_url").parquet("repositories.parquet") 90 | // or 91 | repositoriesDf.write.jdbc(url, tableName, properties) 92 | 93 | ``` 94 | 95 | And then read it back using the `SparkSession.read` method. 96 | 97 | ## Impact 98 | 99 | The current Join Rule optimization for Git Datasources will not be applied. 100 | That means that, 101 | if we do a Join between two JDBC datasource tables, 102 | the Join will be executed at the Spark level, 103 | doing a full scan on both JDBC tables. 104 | That can work really well with a small amount of repositories and siva files, 105 | but if we want an Engine as scalable as Spark, we should avoid this kind of operation. 106 | 107 | ## References 108 | 109 | [DataFrameReader API][1] 110 | 111 | [DataFrameWriter API][2] 112 | 113 | [1]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrameReader 114 | [2]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrameWriter 115 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-002.md: -------------------------------------------------------------------------------- 1 | # Use Borges DB as metadata on Engine instead of config files inside siva files. 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 2 | 6 | | Title | Use Borges DB as metadata on Engine instead of config files inside siva files. | 7 | | Author | Antonio Navarro | 8 | | Status | Rejected | 9 | | Created | 2017-11-15 | 10 | | Updated | 2017-11-21 | 11 | | Target version | 0.X | 12 | 13 | ## Abstract 14 | 15 | In our current pipeline, 16 | the component in charge of fetching repositories and organizing them into rooted repositories ([Borges][2]) already creates a database with a lot of useful metadata about the repositories. 17 | With this ENIP we want to use that metadata to improve Engine performance. 18 | 19 | ## Rationale 20 | 21 | Borges creates a new row per fetched repository. 22 | The current schema is: 23 | 24 | |Column|Type|Description| 25 | |---|---|---| 26 | |ID|ULID|Unique ULID for a repository | 27 | |CreatedAt|Timestamp|| 28 | |UpdatedAt|Timestamp|| 29 | |Endpoints|Array[String]|Endpoints is a slice of valid git endpoints to reach this repository. For example, git://host/my/repo.git and https://host/my/repo.git. They are meant to be endpoints of the same exact repository, and not mirrors.| 30 | |FetchStatus|String|Current status of the repository; it can be "not_found", "fetched", "pending" or "fetching"| 31 | |FetchedAt|Timestamp|FetchedAt is the timestamp of the last time this repository was fetched and archived in our repository storage successfully.| 32 | |FetchedErrorAt|Timestamp|FetchErrorAt is the timestamp of the last fetch error, if any.| 33 | |LastCommitAt|Timestamp|LastCommitAt is the last commit time found in this repository.| 34 | |References|JsonB|References is the current slice of references as present in our repository storage.| 35 | |IsFork|Boolean|IsFork stores if this repository is a fork or not.
It can be nil if we don't know.| 36 | 37 | The content of the References Json is: 38 | 39 | |Column|Type|Description| 40 | |---|---|---| 41 | |Name|String|Name is the full reference name.| 42 | |Hash|Array[Byte]|Hash is the hash of the reference.| 43 | |Init|Array[Byte]|Init is the hash of the init commit reached from this reference.| 44 | |Roots|Array[Array[Byte]]|Roots is a slice of the hashes of all root commits reachable from this reference.| 45 | |Time|Timestamp|Time is the time of the commit this reference points to.| 46 | 47 | The JDBC connector returns [json and jsonb types as String][1], 48 | so we should apply a user-defined function to parse them into a StructType in order to query their internal content. 49 | 50 | ## Specification 51 | 52 | Create a new method on the Engine that allows us to register the "repositories" and "references" tables as views from a JDBC datasource. 53 | We should give these methods Borges-related names because this functionality is tightly tied to it. 54 | Example: `fromBorgesMetadata(options)` 55 | 56 | Because "references" is one of the columns of the "repositories" table, 57 | we should create a view from a query that applies the `from_json()` function 58 | and expands the result into a new table with all the reference elements of the arrays. 59 | 60 | As a first approach, the current table schemas will be preserved and the data mapped to them. 61 | The schema can later be modified as long as the main columns, 62 | the ones used to join data between tables, 63 | are preserved. 64 | 65 | The "repositories" view will filter out all repositories that are not in *fetched* status to avoid consistency problems with the existing rooted repositories. 66 | 67 | We also need to check whether the names for special references (HEAD and master) are specified in the same way as they are in rooted repositories. 68 | 69 | The existing logic to generate the repository id in the Engine will be reused to get that data from the Borges table. 70 | 71 | ## Alternatives 72 | 73 | The current Spark API allows us to create a DataFrame from any JDBC connection. 74 | 75 | ## Impact 76 | 77 | A new method on the Engine to use the Borges views instead of the standard ones. 78 | 79 | ## References 80 | - [Borges][2] 81 | - [json and jsonb types are returned as String][1] 82 | 83 | [1]: https://github.com/apache/spark/blob/0c0ad436ad909364915b910867d08262c62bc95d/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L58 84 | [2]: https://github.com/src-d/borges 85 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-004.md: -------------------------------------------------------------------------------- 1 | | Field | Value | 2 | | --- | --- | 3 | | ENIP | 4 | 4 | | Title | Get only first reference commit by default | 5 | | Author | Miguel Molina | 6 | | Status | Accepted | 7 | | Created | 2018-01-10 | 8 | | Updated | 2018-01-10 | 9 | | Target version | `0.4.x` | 10 | 11 | ## Abstract 12 | 13 | The purpose of this proposal is to make the engine's default behavior be to get only the first reference commit (i.e. the current state of that reference), and to get all reference commits only when explicitly asked via the `getAllReferenceCommits` method. 14 | 15 | ## Rationale 16 | 17 | The rationale behind this is that most of the time what you want is `getFirstReferenceCommit`, which is the obvious thing a person would expect the engine to do.
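As a rough illustration (a hypothetical usage sketch based on the query-chaining examples in `_examples/`, not part of this proposal's specification), the proposed default versus the explicit opt-in would look like this:

```scala
import tech.sourced.engine._

val engine = Engine(spark, "/path/to/siva-files", "siva")

// Proposed default: only the tip commit of each reference (the `index = 0` case).
val tipCommits = engine.getRepositories.getReferences.getCommits

// Explicit opt-in: every commit reachable from each reference.
val allCommits = engine.getRepositories.getReferences.getAllReferenceCommits
```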
If you want the more detailed output provided by `getAllReferenceCommits`, you are explicitly opting in, knowing that it will severely harm the performance of the job you're running. 18 | 19 | So this is a default that makes more sense, for the following reasons: 20 | 21 | * More obvious behavior for newcomers. 22 | * A default that does not severely impact performance. 23 | * The previous behavior can still be achieved. 24 | 25 | ## Specification 26 | 27 | While this may seem like a very simple and easy issue, it is not so simple once one takes a close look at how the engine queries are built. Right now, `getFirstReferenceCommit` adds a simple `index = 0` filter, but that's the opt-in behavior. If we want to make it the default, we need to make this change at the iterator level rather than by adding filters to the query, because otherwise `getAllReferenceCommits` would not be able to remove the filter node. 28 | 29 | - `CommitIterator` needs to be changed to get just the first commit if no `index` filter is provided, and to get N commits if one is. 30 | - `getAllReferenceCommits` can't provide a single number to match, like `index = 0`; it would need to express `index >= 0`, which would require the `EqualThanOrEqual` filter. 31 | - Implement the `EqualThanOrEqual` filter, which requires a complete refactor of the filters: they currently work only for equality, providing a list of values that match the given filter instead of returning a function that can be evaluated inside the iterators. 32 | 33 | 34 | ## Alternatives 35 | 36 | As an alternative, instead of `index >= 0` one could use `index <> -1`, which would work with the current filters implementation, thus making the implementation of this proposal easier, although hackier. 37 | 38 | I would suggest taking the first approach, so that we can pave the road to support all possible filter nodes, which would make our iterators more efficient at filtering data without relying on Spark to do that. 39 | 40 | ## Impact 41 | 42 | This change breaks compatibility with all prior versions because it changes the output produced by the engine, so it should be released as a `0.4.x` version. 43 | 44 | ## References 45 | 46 | n/a 47 | -------------------------------------------------------------------------------- /documentation/proposals/README.md: -------------------------------------------------------------------------------- 1 | # Engine Improvement Proposals 2 | 3 | ## Introduction 4 | 5 | This is the index of Engine Improvement Proposals, known as ENIPs.
6 | 7 | ## All Proposals by Number 8 | 9 | | Number | Status | Title | 10 | | ------ | -------- |----------------------------------------------------------------------| 11 | | 0 | Accepted | [ENIP Template](ENIP-000.md)| 12 | | 1 | Rejected | [Pre-compute repository metadata and save it to another DataSource](ENIP-001.md)| 13 | | 2 | Rejected | [Use Borges DB as metadata on Engine instead of config files inside siva files.](ENIP-002.md)| 14 | | 3 | Draft | [Local SQLite database per worker for on-demand metadata storage](ENIP-003.md)| 15 | -------------------------------------------------------------------------------- /key.asc.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/key.asc.enc -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | lazy val sparkVersion: String = sys.props.get("spark.version") 5 | .getOrElse("2.2.1") 6 | 7 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.1" 8 | lazy val scoverage = "org.scoverage" %% "scalac-scoverage-plugin" % "1.3.1" 9 | lazy val sparkSql = "org.apache.spark" %% "spark-sql" % sparkVersion 10 | lazy val newerHadoopClient = "org.apache.hadoop" % "hadoop-client" % "2.7.2" 11 | lazy val fixNettyForGrpc = "io.netty" % "netty-all" % "4.1.17.Final" 12 | lazy val jgit = "org.eclipse.jgit" % "org.eclipse.jgit" % "4.9.0.201710071750-r" 13 | lazy val siva = "tech.sourced" % "siva-java" % "0.1.3" 14 | lazy val bblfsh = "org.bblfsh" % "bblfsh-client" % "1.9.1" 15 | lazy val enry = "tech.sourced" % "enry-java" % "1.6.3" 16 | lazy val commonsIO = "commons-io" % "commons-io" % "2.5" 17 | lazy val commonsPool = "org.apache.commons" % "commons-pool2" % "2.4.3" 18 | lazy val scalaLib = "org.scala-lang" % "scala-library" % "2.11.11" 19 | lazy val sqlite = "org.xerial" % "sqlite-jdbc" % "3.21.0" 20 | lazy val metrics = "com.groupon.dse" % "spark-metrics" % "2.0.0" 21 | } 22 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.13 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 3 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") 4 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 5 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.2") 6 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 7 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1") 8 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.9.3") 9 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | dist 4 | build 5 | *.log 6 | jars 7 | *.pyc 8 | metastore_db 9 | *.zip -------------------------------------------------------------------------------- /python/MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include README.md version.txt 2 | recursive-include jars *.jar 3 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | ENGINE_UBER_JAR = jgit-spark-connector-uber.jar 2 | ENGINE_UBER_JAR_LOCATION = ../target/$(ENGINE_UBER_JAR) 3 | JARS_DIR = jars 4 | 5 | 6 | $(JARS_DIR): 7 | mkdir -p $(JARS_DIR) 8 | 9 | .PHONY: test clean 10 | test: clean $(JARS_DIR) 11 | cp $(ENGINE_UBER_JAR_LOCATION) $(JARS_DIR) && \ 12 | python -m unittest discover -v -s ./test -t . 13 | 14 | clean: 15 | if [ -d $(JARS_DIR) ] ; \ 16 | then \ 17 | rm -r $(JARS_DIR) ; \ 18 | fi 19 | -------------------------------------------------------------------------------- /python/README.rst: -------------------------------------------------------------------------------- 1 | jgit-spark-connector 2 | ---------------------------- 3 | 4 | Python wrapper of the jgit-spark-connector to perform analysis on top of source code. -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from setuptools import setup 3 | import os 4 | from os.path import exists, join, dirname, realpath 5 | 6 | CURR_DIR = dirname(realpath(__file__)) 7 | VERSION_FILE = join(CURR_DIR, "version.txt") 8 | README_FILE = join(CURR_DIR, "README.rst") 9 | 10 | if exists(VERSION_FILE): 11 | with open(VERSION_FILE, 'r') as f: 12 | __version__ = f.read().strip() 13 | else: 14 | __version__ = 'local' 15 | 16 | with open(README_FILE, 'r') as f: 17 | README = f.read() 18 | 19 | setup( 20 | name="sourced-jgit-spark-connector", 21 | description="Engine to use Spark on top of source code repositories.", 22 | long_description=README, 23 | version=__version__, 24 | license="Apache-2.0", 25 | author="source{d}", 26 | author_email="hello@sourced.tech", 27 | url="https://github.com/src-d/jgit-spark-connector/tree/master/python", 28 | packages=['sourced.engine'], 29 | namespace_packages=['sourced'], 30 | install_requires=[ 31 | "pyspark==" + os.environ.get('SPARK_VERSION', "2.2.1"), 32 | "bblfsh==2.9.13" 33 | ], 34 | classifiers=[ 35 | "Development Status :: 2 - Pre-Alpha", 36 | "Intended Audience :: Developers", 37 | "License :: OSI Approved :: Apache Software License", 38 | "Topic :: Scientific/Engineering :: Information Analysis", 39 | "Programming Language :: Python :: 2.7", 40 | "Programming Language :: Python :: 3" 41 | ] 42 | ) 43 | -------------------------------------------------------------------------------- /python/sourced/__init__.py: -------------------------------------------------------------------------------- 1 | # You must not include any other code and data in a namespace package's __init__.py 2 | import pkg_resources 3 | pkg_resources.declare_namespace(__name__) 4 | -------------------------------------------------------------------------------- /python/sourced/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from importlib import import_module 3 | from bblfsh.sdkversion 
import VERSION 4 | 5 | from sourced.engine.engine import Engine, SourcedDataFrame 6 | 7 | def parse_uast_node(data): 8 | """ 9 | Parses a byte array and turns it into an UAST node. 10 | 11 | >>> parse_uast_node(row["uast"]) 12 | 13 | :param data: binary-encoded uast as a byte array 14 | :type data: byte array 15 | :rtype: UAST node 16 | """ 17 | return import_module( 18 | "bblfsh.gopkg.in.bblfsh.sdk.%s.uast.generated_pb2" % VERSION)\ 19 | .Node.FromString(data) -------------------------------------------------------------------------------- /python/sourced/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/python/sourced/examples/__init__.py -------------------------------------------------------------------------------- /python/sourced/examples/basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sourced.engine import Engine 3 | from pyspark.sql import SparkSession 4 | 5 | def main(): 6 | file_path = os.path.dirname(os.path.realpath(__file__)) 7 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 8 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 9 | engine = Engine(session, repos_path, "siva") 10 | engine.repositories.references.master_ref.commits.show() 11 | 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /python/sourced/examples/repo_files.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | rows = engine.repositories.references.head_ref.commits\ 13 | .tree_entries.select('path').collect() 14 | 15 | files = [r['path'] for r in rows] 16 | 17 | print("FILES:") 18 | for f in files: 19 | print(f) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /python/sourced/examples/repo_references.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | refs = engine.repositories.filter('id = "github.com/xiyou-linuxer/faq-xiyoulinux"')\ 13 | .references.select('name').collect() 14 | 15 | refs = [r['name'] for r in refs] 16 | 17 | print("REFERENCES:") 18 | for r in refs: 19 | print(r) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /python/sourced/examples/repos.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | rows = engine.repositories.select('id').collect() 13 | 14 | repos = [r['id'] for r in rows] 15 | 16 | print("REPOS:") 17 | for r in repos: 18 | print(r) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /python/sourced/examples/uasts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sourced.engine import Engine 3 | from pyspark.sql import SparkSession 4 | 5 | def main(): 6 | file_path = os.path.dirname(os.path.realpath(__file__)) 7 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 8 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 9 | engine = Engine(session, repos_path, "siva") 10 | engine.repositories.references\ 11 | .filter('name = "refs/heads/develop"')\ 12 | .commits.tree_entries.blobs\ 13 | .classify_languages()\ 14 | .filter('lang = "Ruby"')\ 15 | .extract_uasts()\ 16 | .show() 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/python/test/__init__.py -------------------------------------------------------------------------------- /python/test/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import realpath, dirname, join 3 | from glob import glob 4 | 5 | jars_path = join(dirname(dirname(realpath(__file__))), "jars") 6 | jars = ':'.join(glob(join(jars_path, '*.jar'))) 7 | os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars %s pyspark-shell' % jars 8 | 9 | from pyspark.sql import SparkSession 10 | from unittest import TestCase 11 | 12 | 13 | class BaseTestCase(TestCase): 14 | def setUp(self): 15 | self.session = SparkSession.builder.appName("test").master("local[*]").getOrCreate() -------------------------------------------------------------------------------- /python/test/test_sourced_dataframe.py: -------------------------------------------------------------------------------- 1 | from sourced.engine import SourcedDataFrame 2 | from .base import BaseTestCase 3 | 4 | 5 | class SourcedDataFrameTestCase(BaseTestCase): 6 | def setUp(self): 7 | BaseTestCase.setUp(self) 8 | df = self.session.createDataFrame([('Alice', 18), ('Amy', 23), ('Cole', 22), ('Aaron', 25), ('Sue', 52)]) 9 | self.df = SourcedDataFrame(df._jdf, self.session, None) 10 | 11 | 12 | def test_filter(self): 13 | self.assert_names(self.df.filter(self.df[1] % 2 == 0), 14 | ['Alice', 'Cole', 'Sue']) 15 | 16 | def test_sort(self): 17 | self.assert_names(self.df.sort(self.df[1]), 18 | ['Alice', 'Cole', 'Amy', 'Aaron', 'Sue']) 19 | 20 | 21 | def assert_names(self, df, names): 22 | result = [r[0] for r in 
df.select(df[0]).collect()] 23 | self.assertEqual(result, names) -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- Scalastyle standard configuration -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/UtilsWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark 2 | 3 | import org.apache.spark.util.Utils 4 | 5 | object UtilsWrapper { 6 | def getLocalDir(conf: SparkConf): String = Utils.getLocalDir(conf) 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.groupon.metrics.UserMetricsSystem 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 6 | import org.apache.spark.sql.sources._ 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.{Row, SQLContext, SparkSession} 9 | import org.apache.spark.{SparkException, UtilsWrapper} 10 | import tech.sourced.engine.iterator._ 11 | import tech.sourced.engine.provider.{RepositoryProvider, RepositoryRDDProvider} 12 | 13 | /** 14 | * Default source to provide new git relations. 15 | */ 16 | class DefaultSource extends RelationProvider with DataSourceRegister { 17 | 18 | /** @inheritdoc */ 19 | override def shortName: String = "git" 20 | 21 | /** @inheritdoc */ 22 | override def createRelation(sqlContext: SQLContext, 23 | parameters: Map[String, String]): BaseRelation = { 24 | val table = parameters.getOrElse( 25 | DefaultSource.TableNameKey, 26 | throw new SparkException("parameter 'table' must be provided") 27 | ) 28 | 29 | val schema: StructType = Schema(table) 30 | 31 | GitRelation(sqlContext.sparkSession, schema, tableSource = Some(table)) 32 | } 33 | 34 | } 35 | 36 | /** 37 | * Just contains some useful constants for the DefaultSource class to use. 38 | */ 39 | object DefaultSource { 40 | val TableNameKey = "table" 41 | val PathKey = "path" 42 | } 43 | 44 | /** 45 | * A relation based on git data from rooted repositories in siva files. The data this relation 46 | * will offer depends on the given `tableSource`, which controls the table that will be accessed. 47 | * Also, the [[tech.sourced.engine.rule.GitOptimizer]] might merge some table sources into one by 48 | * squashing joins, so the result will be the resultant table chained with the previous one using 49 | * chained iterators.
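 * For example, when the repositories, references and commits sources are squashed into one relation, buildScan serves it with a RepositoryIterator feeding a ReferenceIterator, which in turn feeds a CommitIterator.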
50 | * 51 | * @param session Spark session 52 | * @param schema schema of the relation 53 | * @param joinConditions join conditions, if any 54 | * @param tableSource source table if any 55 | */ 56 | case class GitRelation(session: SparkSession, 57 | schema: StructType, 58 | joinConditions: Option[Expression] = None, 59 | tableSource: Option[String] = None) 60 | extends BaseRelation with CatalystScan { 61 | 62 | private val localPath: String = UtilsWrapper.getLocalDir(session.sparkContext.getConf) 63 | private val path: String = session.conf.get(RepositoriesPathKey) 64 | private val repositoriesFormat: String = session.conf.get(RepositoriesFormatKey) 65 | private val skipCleanup: Boolean = session.conf. 66 | get(SkipCleanupKey, default = "false").toBoolean 67 | private val skipReadErrors: Boolean = session.conf. 68 | get(SkipReadErrorsKey, default = "false").toBoolean 69 | private val parallelism: Int = session.sparkContext.defaultParallelism 70 | 71 | // this needs to be overridden to extend BaseRelataion, 72 | // though is not very useful since already we have the SparkSession 73 | override def sqlContext: SQLContext = session.sqlContext 74 | 75 | override def unhandledFilters(filters: Array[Filter]): Array[Filter] = { 76 | super.unhandledFilters(filters) 77 | } 78 | 79 | override def buildScan(requiredColumns: Seq[Attribute], 80 | filters: Seq[Expression]): RDD[Row] = { 81 | val sc = session.sparkContext 82 | val reposRDD = RepositoryRDDProvider(sc).get(path, repositoriesFormat) 83 | 84 | val requiredCols = sc.broadcast(requiredColumns.map(_.name).toArray) 85 | val reposLocalPath = sc.broadcast(localPath) 86 | val sources = sc.broadcast(Sources.getSources(tableSource, schema)) 87 | val filtersBySource = sc.broadcast(Sources.getFiltersBySource(filters)) 88 | 89 | reposRDD.flatMap(source => { 90 | val provider = RepositoryProvider(reposLocalPath.value, skipCleanup, parallelism * 2) 91 | 92 | val repo = UserMetricsSystem.timer("RepositoryProvider").time({ 93 | provider.get(source) 94 | }) 95 | 96 | // since the sources are ordered by their hierarchy, we can chain them like this 97 | // using the last used iterator as input for the current one 98 | var iter: Option[ChainableIterator[_]] = None 99 | sources.value.foreach({ 100 | case k@"repositories" => 101 | iter = Some(new RepositoryIterator( 102 | source.root, 103 | requiredCols.value, 104 | repo, 105 | filtersBySource.value.getOrElse(k, Seq()), 106 | skipReadErrors 107 | )) 108 | 109 | case k@"references" => 110 | iter = Some(new ReferenceIterator( 111 | requiredCols.value, 112 | repo, 113 | iter.map(_.asInstanceOf[RepositoryIterator]).orNull, 114 | filtersBySource.value.getOrElse(k, Seq()), 115 | skipReadErrors 116 | )) 117 | 118 | case k@"commits" => 119 | iter = Some(new CommitIterator( 120 | requiredCols.value, 121 | repo, 122 | iter.map(_.asInstanceOf[ReferenceIterator]).orNull, 123 | filtersBySource.value.getOrElse(k, Seq()), 124 | skipReadErrors 125 | )) 126 | 127 | case k@"tree_entries" => 128 | iter = Some(new GitTreeEntryIterator( 129 | requiredCols.value, 130 | repo, 131 | iter.map(_.asInstanceOf[CommitIterator]).orNull, 132 | filtersBySource.value.getOrElse(k, Seq()), 133 | skipReadErrors 134 | )) 135 | 136 | case k@"blobs" => 137 | iter = Some(new BlobIterator( 138 | requiredCols.value, 139 | repo, 140 | iter.map(_.asInstanceOf[GitTreeEntryIterator]).orNull, 141 | filtersBySource.value.getOrElse(k, Seq()), 142 | skipReadErrors 143 | )) 144 | 145 | case other => throw new SparkException(s"required cols for '$other' is not 
supported") 146 | }) 147 | 148 | // FIXME: when the RDD is persisted to disk the last element of this iterator is closed twice 149 | new CleanupIterator(iter.getOrElse(Seq().toIterator), provider.close(source, repo)) 150 | }) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/Schema.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.catalyst.expressions.Attribute 5 | import org.apache.spark.sql.types._ 6 | 7 | /** 8 | * Schema contains all the schemas of the multiple tables offered by this library. 9 | */ 10 | private[engine] object Schema { 11 | 12 | /** 13 | * Repositories table schema. Contains just the identifier of the repository, 14 | * its URLs and whether it's a fork or not. 15 | */ 16 | val repositories = StructType( 17 | StructField("id", StringType, nullable = false) :: 18 | StructField("urls", ArrayType(StringType, containsNull = false), nullable = false) :: 19 | StructField("is_fork", BooleanType) :: 20 | StructField("repository_path", StringType) :: 21 | Nil 22 | ) 23 | 24 | /** 25 | * References table schema containing the repository to which they belong, 26 | * the name and the hash of the object they point to. 27 | */ 28 | val references = StructType( 29 | StructField("repository_id", StringType, nullable = false) :: 30 | StructField("name", StringType, nullable = false) :: 31 | StructField("hash", StringType, nullable = false) :: 32 | StructField("is_remote", BooleanType, nullable = false) :: 33 | Nil 34 | ) 35 | 36 | /** 37 | * Commits table schema containing all the data about commits. 38 | */ 39 | val commits = StructType( 40 | StructField("repository_id", StringType, nullable = false) :: 41 | StructField("reference_name", StringType, nullable = false) :: 42 | StructField("index", IntegerType, nullable = false) :: 43 | StructField("hash", StringType, nullable = false) :: 44 | StructField("message", StringType, nullable = false) :: 45 | StructField("parents", ArrayType(StringType, containsNull = false)) :: 46 | StructField("parents_count", IntegerType, nullable = false) :: 47 | 48 | StructField("author_email", StringType) :: 49 | StructField("author_name", StringType) :: 50 | StructField("author_date", TimestampType) :: 51 | 52 | StructField("committer_email", StringType) :: 53 | StructField("committer_name", StringType) :: 54 | StructField("committer_date", TimestampType) :: 55 | 56 | Nil 57 | ) 58 | 59 | /** 60 | * Tree Entries table schema containing all the tree entries data. 61 | */ 62 | val treeEntries = StructType( 63 | StructField("commit_hash", StringType, nullable = false) :: 64 | StructField("repository_id", StringType, nullable = false) :: 65 | StructField("reference_name", StringType, nullable = false) :: 66 | StructField("path", StringType, nullable = false) :: 67 | StructField("blob", StringType, nullable = false) :: 68 | Nil 69 | ) 70 | 71 | /** 72 | * Blobs table schema containing all the blobs data. 
73 | */ 74 | val blobs = StructType( 75 | StructField("blob_id", StringType, nullable = false) :: 76 | StructField("commit_hash", StringType, nullable = false) :: 77 | StructField("repository_id", StringType, nullable = false) :: 78 | StructField("reference_name", StringType, nullable = false) :: 79 | StructField("content", BinaryType) :: 80 | StructField("is_binary", BooleanType, nullable = false) :: 81 | Nil 82 | ) 83 | 84 | /** 85 | * Return the schema for the table with the given name. Throws a SparkException 86 | * if there is no schema for the given table. 87 | * 88 | * @param table name 89 | * @return schema for the table 90 | * @throws SparkException if the table does not exist 91 | */ 92 | def apply(table: String): StructType = table match { 93 | case "repositories" => Schema.repositories 94 | case "references" => Schema.references 95 | case "commits" => Schema.commits 96 | case "tree_entries" => Schema.treeEntries 97 | case "blobs" => Schema.blobs 98 | case other => throw new SparkException(s"table '$other' is not supported") 99 | } 100 | 101 | /** 102 | * Returns a tuple with the table and column names for the given attribute. 103 | * Because metadata tables are different from git relation tables, some fields 104 | * need to be mapped to match one schema with the other. 105 | * 106 | * @param attr attribute from the git relation schema 107 | * @return table and column names 108 | */ 109 | def metadataTableAndCol(attr: Attribute): (String, String) = { 110 | val name = attr.name 111 | val table = attr.metadata.getString(Sources.SourceKey) 112 | metadataMappings(table, name).getOrElse((table, name)) 113 | } 114 | 115 | /** 116 | * Mappings between a table name and column name in the git relation schema 117 | * and their counterpart in the metadata schema. 118 | * 119 | * @param table table name 120 | * @param name column name 121 | * @return a tuple with table and column name or None if there is no mapping 122 | */ 123 | def metadataMappings(table: String, name: String): Option[(String, String)] = 124 | Option((table, name) match { 125 | case ("commits", "index") => 126 | (RepositoryHasCommitsTable, "index") 127 | case ("commits", "repository_id") => 128 | (RepositoryHasCommitsTable, "repository_id") 129 | case ("commits", "reference_name") => 130 | (RepositoryHasCommitsTable, "reference_name") 131 | case ("tree_entries", "repository_id") => 132 | (RepositoryHasCommitsTable, "repository_id") 133 | case ("tree_entries", "reference_name") => 134 | (RepositoryHasCommitsTable, "reference_name") 135 | case _ => null 136 | }) 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/Sources.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.sql.catalyst.expressions.Expression 4 | import org.apache.spark.sql.types.StructType 5 | import tech.sourced.engine.util.{CompiledFilter, Filter} 6 | 7 | /** 8 | * Defines the hierarchy between data sources. 9 | */ 10 | object Sources { 11 | 12 | val SourceKey: String = "source" 13 | 14 | /** Sources ordered by their position in the hierarchy. */ 15 | val orderedSources = Array( 16 | "repositories", 17 | "references", 18 | "commits", 19 | "tree_entries", 20 | "blobs" 21 | ) 22 | 23 | /** 24 | * Compares two sources. 
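 * For example, compare("repositories", "commits") returns a negative value, since "repositories" appears before "commits" in orderedSources.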
25 | * 26 | * @param a first source 27 | * @param b second source 28 | * @return comparison result 29 | */ 30 | def compare(a: String, b: String): Int = orderedSources.indexOf(a) 31 | .compareTo(orderedSources.indexOf(b)) 32 | 33 | /** 34 | * Returns the list of sources in the schema or the table source if any. 35 | * 36 | * @param tableSource optional source table 37 | * @param schema resultant schema 38 | * @return sequence with table sources 39 | */ 40 | def getSources(tableSource: Option[String], 41 | schema: StructType): Seq[String] = 42 | tableSource match { 43 | case Some(ts) => Seq(ts) 44 | case None => 45 | schema 46 | .map(_.metadata.getString(SourceKey)) 47 | .distinct 48 | .sortWith(Sources.compare(_, _) < 0) 49 | } 50 | 51 | def getFiltersBySource(filters: Seq[Expression]): Map[String, Seq[CompiledFilter]] = 52 | filters.flatMap(Filter.compile) 53 | .map(e => (e.sources.distinct, e)) 54 | .filter(_._1.lengthCompare(1) == 0) 55 | .groupBy(_._1) 56 | .map { case (k, v) => (k.head, v.map(_._2)) } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/TableBuilder.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.sql.{DriverManager, PreparedStatement} 4 | 5 | import org.apache.spark.SparkException 6 | import org.apache.spark.internal.Logging 7 | import org.apache.spark.sql.types._ 8 | 9 | private[engine] object TableBuilder { 10 | def columnSql(field: StructField): String = 11 | s"`${field.name}` ${sqlType(field.dataType)}" + 12 | (if (!field.nullable) s" NOT NULL" else "") 13 | 14 | def pkSql(cols: Seq[String]): Option[String] = if (cols.isEmpty) { 15 | None 16 | } else { 17 | Some(s"PRIMARY KEY (${cols.map(v => s"`$v`").mkString(", ")})") 18 | } 19 | 20 | def indexSql(table: String, col: String): String = 21 | s"CREATE INDEX `${table}_${col}_idx` ON $table(`$col`)" 22 | 23 | def sqlType(dt: DataType): String = dt match { 24 | case IntegerType => "INTEGER" 25 | case LongType | TimestampType => "BIGINT" 26 | case DoubleType => "DOUBLE PRECISION" 27 | case FloatType => "REAL" 28 | case ShortType | ByteType => "TINYINT" 29 | case StringType => "TEXT" 30 | case _ => throw new SparkException(s"there is no SQLite type for datatype $dt") 31 | } 32 | } 33 | 34 | private[engine] case class Table(name: String, 35 | pks: Seq[String], 36 | indexes: Seq[String]) extends Logging { 37 | private def sql(schema: StructType): Seq[String] = { 38 | Seq(s"CREATE TABLE $name (" + 39 | (schema.map(TableBuilder.columnSql) ++ TableBuilder.pkSql(pks)).mkString(",\n") 40 | + s")") ++ 41 | pks.map(TableBuilder.indexSql(name, _)) 42 | } 43 | 44 | def create(dbPath: String, schema: StructType): Unit = { 45 | val conn = DriverManager.getConnection(s"jdbc:sqlite:$dbPath") 46 | conn.setAutoCommit(false) 47 | try { 48 | sql(schema).foreach(sql => { 49 | log.debug(s"executing SQL statement for table `$name`: `$sql`") 50 | var stmt: PreparedStatement = null 51 | try { 52 | stmt = conn.prepareStatement(sql) 53 | stmt.execute() 54 | } finally { 55 | if (stmt != null) { 56 | stmt.close() 57 | } 58 | } 59 | }) 60 | conn.commit() 61 | } catch { 62 | case e: Exception => 63 | log.error(s"unable to create table $name and its indexes", e) 64 | conn.rollback() 65 | } finally { 66 | conn.close() 67 | } 68 | } 69 | } 70 | 71 | object Tables { 72 | 73 | val repositories = Table( 74 | prefix("repositories"), 75 | Seq("id"), 76 | Seq("repository_path") 77 | ) 78 
| 79 | val references = Table( 80 | prefix("references"), 81 | Seq("name", "repository_id"), 82 | Seq() 83 | ) 84 | 85 | val commits = Table( 86 | prefix("commits"), 87 | Seq("hash"), 88 | Seq() 89 | ) 90 | 91 | val repoHasCommits = Table( 92 | prefix("repository_has_commits"), 93 | Seq("hash", "repository_id", "reference_name"), 94 | Seq("index") 95 | ) 96 | 97 | val treeEntries = Table( 98 | prefix("tree_entries"), 99 | // blob id can point to several paths, so we need this overly complex composite pk 100 | Seq("blob", "path", "commit_hash"), 101 | Seq() 102 | ) 103 | 104 | def apply(name: String): Table = name match { 105 | case "repositories" => repositories 106 | case "references" => references 107 | case "commits" => commits 108 | case "repository_has_commits" => repoHasCommits 109 | case "tree_entries" => treeEntries 110 | } 111 | 112 | def prefix(name: String): String = s"engine_$name" 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/compat/compat.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.compat 2 | 3 | import org.apache.spark.SPARK_VERSION 4 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 5 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 6 | import org.apache.spark.sql.execution.datasources.{ 7 | LogicalRelation => SparkLogicalRelation 8 | } 9 | import org.apache.spark.sql.sources.BaseRelation 10 | 11 | import scala.reflect.runtime.{universe => ru} 12 | 13 | private[compat] object Compat { 14 | 15 | def apply[T](s22: T, s23: T): T = SPARK_VERSION match { 16 | case s if s.startsWith("2.2.") => s22 17 | case s if s.startsWith("2.3.") => s23 18 | case _ => 19 | throw new RuntimeException(s"Unsupported SPARK_VERSION: $SPARK_VERSION") 20 | } 21 | 22 | lazy val ClassMirror = ru.runtimeMirror(Compat.getClass.getClassLoader) 23 | 24 | } 25 | 26 | private[engine] object LogicalRelation { 27 | 28 | def apply(rel: BaseRelation, 29 | out: Seq[AttributeReference], 30 | catalog: Option[CatalogTable]): SparkLogicalRelation = 31 | applyImpl(rel, out, catalog) 32 | 33 | private lazy val applyImpl = 34 | Compat(applySpark22(_, _, _), applySpark23(_, _, _)) 35 | 36 | private lazy val typ = ru.typeOf[SparkLogicalRelation] 37 | private lazy val classSymbol = 38 | Compat.ClassMirror.reflectClass(typ.typeSymbol.asClass) 39 | private lazy val ctor = 40 | classSymbol.reflectConstructor(typ.decl(ru.termNames.CONSTRUCTOR).asMethod) 41 | 42 | def applySpark22(rel: BaseRelation, 43 | out: Seq[AttributeReference], 44 | catalog: Option[CatalogTable]): SparkLogicalRelation = 45 | ctor(rel, out, catalog).asInstanceOf[SparkLogicalRelation] 46 | 47 | def applySpark23(rel: BaseRelation, 48 | out: Seq[AttributeReference], 49 | catalog: Option[CatalogTable]): SparkLogicalRelation = 50 | ctor(rel, out, catalog, false).asInstanceOf[SparkLogicalRelation] 51 | 52 | def unapply(arg: SparkLogicalRelation) 53 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = 54 | unapplyImpl(arg) 55 | 56 | private lazy val unapplyImpl = Compat(unapplySpark22(_), unapplySpark23(_)) 57 | 58 | def unapplySpark22(arg: SparkLogicalRelation) 59 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = 60 | Some((arg.relation, arg.output, arg.catalogTable)) 61 | 62 | def unapplySpark23(arg: SparkLogicalRelation) 63 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = { 64 | val isStreaming = 
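// Spark 2.3 added an `isStreaming` flag to LogicalRelation; it is read reflectively here so
// a single artifact can support both 2.2 and 2.3, and streaming relations are not matched.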
Compat.ClassMirror 65 | .reflect(arg) 66 | .reflectField(typ.decl(ru.TermName("isStreaming")).asTerm) 67 | .get 68 | .asInstanceOf[Boolean] 69 | if (isStreaming) { 70 | None 71 | } else { 72 | Some((arg.relation, arg.output, arg.catalogTable)) 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/exception/RepositoryException.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.exception 2 | 3 | import org.eclipse.jgit.lib.Repository 4 | 5 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter 6 | 7 | /** 8 | * Exception to add repository debug information to any 9 | * uncontrolled exception. It does not add a stacktrace level. 10 | * 11 | * @param repo Repository that was beeing iterated 12 | * @param cause Original exception 13 | */ 14 | class RepositoryException(repo: Repository, cause: Throwable) 15 | extends Exception( 16 | s"Repository error with data: ${RepositoryException.repoInfo(repo)}", 17 | cause, 18 | true, 19 | false 20 | ) {} 21 | 22 | object RepositoryException { 23 | 24 | def apply(repo: Repository, cause: Throwable): RepositoryException = { 25 | new RepositoryException(repo, cause) 26 | } 27 | 28 | /** 29 | * Returns a string with a debug description of the repository 30 | * @param repo Repository to describe 31 | * @return 32 | */ 33 | def repoInfo(repo: Repository): String = { 34 | val repoPath = try { 35 | repo.toString 36 | } catch { 37 | case _: Throwable => "Unknown repository path" 38 | } 39 | 40 | try { 41 | val c = repo.getConfig 42 | val remotes = c.getSubsections("remote").asScala 43 | val urls = remotes.flatMap(r => c.getStringList("remote", r, "url")) 44 | 45 | if (urls.isEmpty) { 46 | repoPath 47 | } else { 48 | s"$repoPath; urls ${urls.toSet.mkString(", ")}" 49 | } 50 | } catch { 51 | case e: Throwable => 52 | s"Exception in RepositoryException.repoInfo for $repoPath: ${e.getMessage}" 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/BlobIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.eclipse.jgit.diff.RawText 5 | import org.eclipse.jgit.errors.MissingObjectException 6 | import org.eclipse.jgit.lib.{ObjectId, Repository} 7 | import tech.sourced.engine.exception.RepositoryException 8 | import tech.sourced.engine.util.{CompiledFilter, Filters} 9 | 10 | /** 11 | * Iterator that will return rows of blobs in a repository. 
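 * Binary blobs are emitted with empty content, and file content is read up to
 * BlobIterator.readMaxBytes bytes.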
12 | * 13 | * @param finalColumns final columns that must be in the resultant row 14 | * @param repo repository to get the data from 15 | * @param prevIter previous iterator, if the iterator is chained 16 | * @param filters filters for the iterator 17 | */ 18 | class BlobIterator(finalColumns: Array[String], 19 | repo: Repository, 20 | prevIter: TreeEntryIterator, 21 | filters: Seq[CompiledFilter], 22 | skipReadErrors: Boolean) 23 | extends ChainableIterator[Blob]( 24 | finalColumns, 25 | Option(prevIter).orNull, 26 | filters, 27 | repo, 28 | skipReadErrors 29 | ) with Logging { 30 | 31 | /** @inheritdoc*/ 32 | override protected def loadIterator(compiledFilters: Seq[CompiledFilter]): Iterator[Blob] = { 33 | val filters = Filters(compiledFilters) 34 | val treeEntryIter = Option(prevIter) match { 35 | case Some(it) => 36 | Seq(it.currentRow).toIterator 37 | case None => GitTreeEntryIterator.loadIterator( 38 | repo, 39 | None, 40 | filters, 41 | blobIdKey = "blob_id" 42 | ) 43 | } 44 | 45 | val iter = treeEntryIter.flatMap(entry => { 46 | if (repo.hasObject(entry.blob)) { 47 | Some( 48 | Blob( 49 | entry.blob, 50 | entry.commitHash, 51 | entry.ref, 52 | entry.repo, 53 | BlobIterator.readFile( 54 | entry.blob, 55 | repo 56 | ) 57 | )) 58 | } else { 59 | None 60 | } 61 | }) 62 | 63 | if (filters.hasFilters("blob_id")) { 64 | iter.filter(b => filters.matches(Seq("blob_id"), b.id.getName)) 65 | } else { 66 | iter 67 | } 68 | } 69 | 70 | override protected def mapColumns(blob: Blob): RawRow = { 71 | val isBinary = RawText.isBinary(blob.content) 72 | 73 | Map[String, Any]( 74 | "commit_hash" -> blob.commit.getName, 75 | "repository_id" -> blob.repo, 76 | "reference_name" -> blob.ref, 77 | "blob_id" -> blob.id.getName, 78 | "content" -> (if (isBinary) Array.emptyByteArray else blob.content), 79 | "is_binary" -> isBinary 80 | ) 81 | } 82 | 83 | } 84 | 85 | case class Blob(id: ObjectId, 86 | commit: ObjectId, 87 | ref: String, 88 | repo: String, 89 | content: Array[Byte]) 90 | 91 | object BlobIterator extends Logging { 92 | /** Max bytes to read for the content of a file. 
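 * Currently 20 MiB; readFile will not load more than this for large objects.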
*/ 93 | val readMaxBytes: Int = 20 * 1024 * 1024 94 | 95 | /** 96 | * Read max N bytes of the given blob 97 | * 98 | * @param objId ID of the object to read 99 | * @param repo repository to get the data from 100 | * @param max maximum number of bytes to read in memory 101 | * @return Bytearray with the contents of the file 102 | */ 103 | def readFile(objId: ObjectId, repo: Repository, max: Integer = readMaxBytes): Array[Byte] = { 104 | val reader = repo.newObjectReader() 105 | val obj = try { 106 | reader.open(objId) 107 | } catch { 108 | case e: MissingObjectException => 109 | log.warn(s"missing object", new RepositoryException(repo, e)) 110 | null 111 | } 112 | 113 | if (obj != null) { 114 | val data = if (obj.isLarge) { 115 | val buf = Array.ofDim[Byte](max) 116 | val is = obj.openStream() 117 | is.read(buf) 118 | is.close() 119 | buf 120 | } else { 121 | obj.getBytes 122 | } 123 | reader.close() 124 | data 125 | } else { 126 | Array.emptyByteArray 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/ChainableIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.Row 5 | import org.eclipse.jgit.api.errors.GitAPIException 6 | import org.eclipse.jgit.errors.{ 7 | IncorrectObjectTypeException, 8 | MissingObjectException, 9 | RevWalkException 10 | } 11 | import org.eclipse.jgit.lib.Repository 12 | import tech.sourced.engine.exception.RepositoryException 13 | import tech.sourced.engine.util.CompiledFilter 14 | 15 | import scala.annotation.tailrec 16 | 17 | /** 18 | * Iterator that can have a previous iterator to output chained values. 19 | * 20 | * @param finalColumns final columns that must be in the resultant row 21 | * @param prevIter previous iterator, if the iterator is chained 22 | * @param filters filters for the iterator 23 | * @tparam T type of data returned by the internal iterator 24 | */ 25 | abstract class ChainableIterator[T](finalColumns: Array[String], 26 | prevIter: ChainableIterator[_], 27 | filters: Seq[CompiledFilter], 28 | val repo: Repository, 29 | skipReadErrors: Boolean 30 | ) extends Iterator[Row] with Logging { 31 | 32 | /** Raw values of the row. */ 33 | type RawRow = Map[String, Any] 34 | 35 | /** Instance of the internal iterator. */ 36 | private var iter: Iterator[T] = _ 37 | 38 | /** The current row of the prevIter, null always if there is no prevIter. */ 39 | private var prevIterCurrentRow: RawRow = _ 40 | 41 | /** The current row of the internal iterator. */ 42 | private[iterator] var currentRow: T = _ 43 | 44 | /** 45 | * Returns the internal iterator that will return the data used to construct the final row. 46 | * 47 | * @param filters filters for the iterator 48 | * @return internal iterator 49 | */ 50 | protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[T] 51 | 52 | /** 53 | * Loads the next internal iterator. 54 | * 55 | * @return internal iterator 56 | */ 57 | private def loadIterator: Iterator[T] = loadIterator(filters) 58 | 59 | /** 60 | * Given the object returned by the internal iterator, this method must transform 61 | * that object into a RawRow. 
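 * The keys of the returned map are column names; the map is merged with the previous
 * iterator's row (if any) and then projected onto finalColumns.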
62 | * 63 | * @param obj object returned by the internal iterator 64 | * @return raw row 65 | */ 66 | protected def mapColumns(obj: T): RawRow 67 | 68 | @tailrec 69 | final override def hasNext: Boolean = { 70 | loadNext match { 71 | case Some(v) => v 72 | case None => hasNext 73 | } 74 | } 75 | 76 | /** 77 | * Load the next iterator and returns if there is a next item or not. If 78 | * it returns some value it means we know for sure there is something or 79 | * not. If it returns None, it means we don't know and another call to 80 | * loadNext is required. 81 | * 82 | * @return whether there is a next item in the iterator or not, or if we 83 | * don't know 84 | */ 85 | final def loadNext: Option[Boolean] = { 86 | try { 87 | // If there is no previous iter just load the iterator the first pass 88 | // and use hasNext of iter all the times. We return here to get rid of 89 | // this logic and assume from this point on that prevIter is not null 90 | if (prevIter == null) { 91 | if (iter == null) { 92 | iter = loadIterator 93 | } 94 | 95 | return Some(iter.hasNext) 96 | } 97 | 98 | // If the iter is not loaded, do so, but only if there are actually more 99 | // rows in the prev iter. If there are, just load the iter and preload 100 | // the prevIterCurrentRow. 101 | if (iter == null) { 102 | if (prevIter.isEmpty) { 103 | return Some(false) 104 | } 105 | 106 | prevIterCurrentRow = prevIter.nextRaw 107 | iter = loadIterator 108 | } 109 | 110 | // if iter is empty, we need to check if there are more rows in the prev iter 111 | // if not, just finish. If there are, preload the next raw row of the prev iter 112 | // and load the iterator again for the prev iter current row 113 | if (iter.hasNext) { 114 | Some(true) 115 | } else { 116 | if (prevIter.isEmpty) { 117 | return Some(false) 118 | } 119 | 120 | prevIterCurrentRow = prevIter.nextRaw 121 | iter = loadIterator 122 | 123 | None 124 | } 125 | } catch { 126 | case e: IncorrectObjectTypeException => 127 | log.debug("incorrect object type", new RepositoryException(repo, e)) 128 | None 129 | case e: MissingObjectException => 130 | log.warn("missing object", new RepositoryException(repo, e)) 131 | None 132 | case e: RevWalkException => 133 | log.warn("rev walk exception", new RepositoryException(repo, e)) 134 | None 135 | case e: GitAPIException => 136 | log.warn("git api exception", new RepositoryException(repo, e)) 137 | None 138 | case e@(_: Exception | _: RuntimeException) => 139 | if (skipReadErrors) { 140 | log.warn("read error skipped", new RepositoryException(repo, e)) 141 | None 142 | } else { 143 | throw new RepositoryException(repo, e) 144 | } 145 | case e: Throwable => 146 | throw e 147 | } 148 | } 149 | 150 | override def next: Row = { 151 | currentRow = iter.next 152 | // FIXME: if there's a repeated column name, value 153 | // will be the last one added. This could be solved by 154 | // qualifying all column names with their source. 
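// Merge the previous iterator's raw row (if any) with this iterator's columns; on duplicate
// keys the value from the current iterator wins, then the result is projected onto
// finalColumns in order.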
155 | val mappedValues = if (prevIterCurrentRow != null) { 156 | prevIterCurrentRow ++ mapColumns(currentRow) 157 | } else { 158 | mapColumns(currentRow) 159 | } 160 | 161 | val values = finalColumns.map(c => mappedValues(c)) 162 | Row(values: _*) 163 | } 164 | 165 | 166 | def nextRaw: RawRow = { 167 | currentRow = iter.next 168 | val row = mapColumns(currentRow) 169 | if (prevIterCurrentRow != null) { 170 | prevIterCurrentRow ++ row 171 | } else { 172 | row 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/CleanupIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.{InterruptibleIterator, TaskContext} 4 | import tech.sourced.engine.exception.RepositoryException 5 | 6 | /** 7 | * Iterator that calls a cleanup function after the given iterator has 8 | * finished or an exception has been thrown. 9 | * 10 | * @param it internal iterator 11 | * @param cleanup cleanup function 12 | * @tparam T type of the rows in the iterator 13 | */ 14 | class CleanupIterator[T](it: Iterator[T], cleanup: => Unit) 15 | extends InterruptibleIterator[T](TaskContext.get(), it) { 16 | 17 | /** @inheritdoc 18 | * 19 | * After catching an exception cleans up all the resources calling the cleanup function 20 | * and will rethrow such exception again. 21 | */ 22 | override def hasNext: Boolean = { 23 | try { 24 | val hasNext = super.hasNext 25 | if (!hasNext) { 26 | val _ = cleanup 27 | } 28 | hasNext 29 | } catch { 30 | case e: Throwable => 31 | val _ = cleanup 32 | throw e 33 | } 34 | } 35 | 36 | /** @inheritdoc*/ 37 | override def next(): T = super.next() 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/MetadataIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.sql.{Connection, DriverManager, ResultSet} 4 | 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.sql.catalyst.expressions.Attribute 7 | import org.apache.spark.sql.types.{ArrayType, BooleanType, TimestampType} 8 | 9 | class MetadataIterator(fields: Seq[Attribute], 10 | dbPath: String, 11 | sql: String) 12 | extends Iterator[Map[String, Any]] with Logging { 13 | 14 | private val iter = new JDBCQueryIterator(fields, dbPath, sql) 15 | 16 | override def hasNext: Boolean = iter.hasNext 17 | 18 | override def next(): Map[String, Any] = { 19 | val values = iter.next() 20 | Map[String, Any](fields.zipWithIndex.map { 21 | case (attr, idx) if attr.dataType == BooleanType => 22 | (attr.name, values(idx) match { 23 | case 0 => false 24 | case 1 => true 25 | case _ => null 26 | }) 27 | case (attr, idx) if attr.dataType.isInstanceOf[ArrayType] => 28 | (attr.name, values(idx).toString.split("\\|")) 29 | case (attr, idx) if attr.dataType == TimestampType => 30 | (attr.name, new java.sql.Timestamp(values(idx).asInstanceOf[Long])) 31 | case (attr, idx) => 32 | (attr.name, values(idx)) 33 | }: _*) 34 | } 35 | 36 | def close(): Unit = iter.close() 37 | 38 | } 39 | 40 | class JDBCQueryIterator(fields: Seq[Attribute], 41 | dbPath: String, 42 | sql: String) 43 | extends Iterator[Array[Any]] with Logging { 44 | 45 | private var rs: ResultSet = _ 46 | private var conn: Connection = _ 47 | private var nextCollected = false 48 | private var hasRows = 
false 49 | 50 | private[iterator] def close(): Unit = { 51 | try { 52 | if (rs != null && !rs.isClosed) { 53 | rs.close() 54 | } 55 | } finally { 56 | if (conn != null && !conn.isClosed) { 57 | try { 58 | conn.close() 59 | } catch { 60 | case e: Exception => log.warn(s"could not close connection", e) 61 | } 62 | } 63 | } 64 | } 65 | 66 | override def hasNext: Boolean = { 67 | if (rs == null) { 68 | initializeResultSet() 69 | } else if (hasRows && !nextCollected) { 70 | // FIXME: RDD groupBy somehow calls hasNext twice, so we can't 71 | // advance the cursor until the next row has been collected to make sure 72 | // we don't skip rows. 73 | return true 74 | } 75 | 76 | try { 77 | if (!rs.isClosed && rs.next) { 78 | hasRows = true 79 | nextCollected = false 80 | true 81 | } else { 82 | close() 83 | false 84 | } 85 | } catch { 86 | case e: Exception => 87 | log.warn(s"caught an exception in JDBCIterator.hasNext", e) 88 | close() 89 | false 90 | } 91 | } 92 | 93 | private def initializeResultSet(): Unit = { 94 | conn = DriverManager.getConnection(s"jdbc:sqlite:$dbPath") 95 | val stmt = conn.prepareStatement(sql) 96 | try { 97 | rs = stmt.executeQuery() 98 | } catch { 99 | case e: Exception => 100 | log.warn(s"could not execute query", e) 101 | close() 102 | } 103 | } 104 | 105 | override def next(): Array[Any] = { 106 | nextCollected = true 107 | fields.zipWithIndex 108 | .map(f => rs.getObject(f._2 + 1)) 109 | .toArray 110 | .asInstanceOf[Array[Any]] 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/ReferenceIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.{ObjectId, Ref, Repository} 4 | import tech.sourced.engine.util.{CompiledFilter, Filters} 5 | 6 | import scala.collection.JavaConverters._ 7 | 8 | /** 9 | * Iterator that will return rows of references in a repository. 10 | * 11 | * @param finalColumns final columns that must be in the resultant row 12 | * @param repo repository to get the data from 13 | * @param prevIter previous iterator, if the iterator is chained 14 | * @param filters filters for the iterator 15 | */ 16 | class ReferenceIterator(finalColumns: Array[String], 17 | repo: Repository, 18 | prevIter: RepositoryIterator, 19 | filters: Seq[CompiledFilter], 20 | skipReadErrors: Boolean) 21 | extends ChainableIterator[Ref]( 22 | finalColumns, 23 | prevIter, 24 | filters, 25 | repo, 26 | skipReadErrors 27 | ) { 28 | 29 | /** @inheritdoc */ 30 | protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[Ref] = 31 | ReferenceIterator.loadIterator( 32 | repo, 33 | Option(prevIter).map(_.currentRow), 34 | Filters(filters) 35 | ) 36 | 37 | /** @inheritdoc */ 38 | override protected def mapColumns(ref: Ref): RawRow = { 39 | val (repoId, refName) = RootedRepo.parseRef(repo, ref.getName) 40 | Map[String, Any]( 41 | "repository_id" -> repoId, 42 | "name" -> refName, 43 | "hash" -> ObjectId.toString(Option(ref.getPeeledObjectId).getOrElse(ref.getObjectId)), 44 | "is_remote" -> RootedRepo.isRemote(repo, ref.getName) 45 | ) 46 | } 47 | } 48 | 49 | object ReferenceIterator { 50 | 51 | /** 52 | * Returns an iterator of references. 53 | * 54 | * @param repo repository to get the data from 55 | * @param filters filters to skip some rows. The only supported filters at the iterator 56 | * level are by repository id and by reference name. 
The keys of said filters 57 | * are controlled by the parameters `repoKey` and `refNameKey`. 58 | * @param repoKey name of the repository id filter key 59 | * @param refNameKey name of the reference name filter key 60 | * @return the iterator 61 | */ 62 | def loadIterator(repo: Repository, 63 | repoId: Option[String], 64 | filters: Filters, 65 | repoKey: String = "repository_id", 66 | refNameKey: String = "name"): Iterator[Ref] = { 67 | val repoKeys = Seq(repoKey) 68 | val repoIds: Array[String] = repoId match { 69 | case Some(id) => 70 | if (!filters.hasFilters(repoKeys: _*) || filters.matches(repoKeys, id)) { 71 | Array(id) 72 | } else { 73 | Array() 74 | } 75 | case None => 76 | RepositoryIterator.loadIterator(repo, filters, repoKey).toArray 77 | } 78 | 79 | val refNameKeys = Seq("name", refNameKey) 80 | val hasRefFilters = filters.hasFilters(refNameKeys: _*) 81 | val out = repo.getAllRefs.asScala.values.filter(ref => { 82 | val (repoId, refName) = RootedRepo.parseRef(repo, ref.getName) 83 | (repoIds.isEmpty || repoIds.contains(repoId)) && 84 | (!hasRefFilters || filters.matches(refNameKeys, refName)) 85 | }) 86 | 87 | out.toIterator 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/RepositoryIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.Repository 4 | import tech.sourced.engine.util.{CompiledFilter, Filters} 5 | 6 | /** 7 | * Iterator that will return rows of repositories in a repository. 8 | * 9 | * @param repositoryPath path of the given repository 10 | * @param finalColumns final columns that must be in the resultant row 11 | * @param repo repository to get the data from 12 | * @param filters filters for the iterator 13 | */ 14 | class RepositoryIterator(repositoryPath: String, 15 | finalColumns: Array[String], 16 | repo: Repository, 17 | filters: Seq[CompiledFilter], 18 | skipReadErrors: Boolean) 19 | extends ChainableIterator[String]( 20 | finalColumns, 21 | null, 22 | filters, 23 | repo, 24 | skipReadErrors 25 | ) { 26 | 27 | // since this iterator does not override getFilters method of RootedRepository 28 | // we can cache here the matching cases, because they are not going to change. 29 | private val matchingFilters = Filters(filters) 30 | 31 | /** @inheritdoc*/ 32 | override protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[String] = 33 | RepositoryIterator.loadIterator(repo, matchingFilters) 34 | 35 | /** @inheritdoc*/ 36 | override protected def mapColumns(id: String): RawRow = { 37 | val c = repo.getConfig 38 | val remote = RootedRepo.getRepositoryRemote(repo, id) 39 | val urls = remote.map(r => c.getStringList("remote", r, "url")) 40 | .orElse(Some(Array[String]())).get 41 | val isFork = remote.map(r => c.getBoolean("remote", r, "isfork", false)) 42 | .orElse(Some(false)).get 43 | 44 | Map[String, Any]( 45 | "id" -> id, 46 | "urls" -> urls, 47 | "is_fork" -> isFork, 48 | "repository_path" -> repositoryPath 49 | ) 50 | } 51 | } 52 | 53 | object RepositoryIterator { 54 | 55 | import scala.collection.JavaConverters._ 56 | 57 | /** 58 | * Returns an iterator of references. 59 | * 60 | * @param repo repository to get the data from 61 | * @param filters filters to skip some rows. The only supported filters at the iterator 62 | * level are by repository id. The key of said filters 63 | * are controlled by the parameter `repoKey`. 
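 * If no filter applies to the id key, every repository id found in the remotes and
 * local refs of the rooted repository is returned.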
64 | * @param repoKey name of the repository id filter key 65 | * @return the iterator 66 | */ 67 | def loadIterator(repo: Repository, 68 | filters: Filters, 69 | repoKey: String = "id"): Iterator[String] = { 70 | // If there's any non-remote reference, it will show up here, thus 71 | // making the local repository appear. If we only take into account 72 | // the remotes the result will be different from the one returned by 73 | // the reference iterator. 74 | // This makes us process this twice in a chained reference iterator 75 | // scenario, even though the result would be correct without this, 76 | // but it's needed for correctness when the table is asked independently. 77 | val refRepos = repo.getAllRefs.asScala.keys 78 | .map(ref => RootedRepo.parseRef(repo, ref)._1) 79 | 80 | val repos = repo.getConfig.getSubsections("remote").asScala.toIterator 81 | .map(RootedRepo.getRepositoryId(repo, _).get) ++ refRepos 82 | 83 | val iter = repos.toList.distinct.toIterator 84 | 85 | val filterKeys = Seq("id", repoKey) 86 | if (filters.hasFilters(filterKeys: _*)) { 87 | iter.filter(filters.matches(filterKeys, _)) 88 | } else { 89 | iter 90 | } 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/RootedRepo.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.{Repository, StoredConfig} 4 | import tech.sourced.engine.util.GitUrlsParser 5 | import scala.collection.JavaConverters.collectionAsScalaIterableConverter 6 | 7 | object RootedRepo { 8 | 9 | /** 10 | * Returns the ID of a repository given its remote name. 11 | * 12 | * @param repo repository 13 | * @param remoteName remote name 14 | * @return repository ID 15 | */ 16 | private[iterator] def getRepositoryId(repo: Repository, remoteName: String): Option[String] = { 17 | // TODO: maybe a cache here could improve performance 18 | val c: StoredConfig = repo.getConfig 19 | c.getSubsections("remote").asScala.find(_ == remoteName) match { 20 | case None => None 21 | case Some(name) => Some(GitUrlsParser.getIdFromUrls( 22 | c.getStringList("remote", name, "url") 23 | )) 24 | } 25 | } 26 | 27 | /** 28 | * Returns the remote name of a repository with the given ID. 29 | * 30 | * @param repo repository 31 | * @param id repository id 32 | * @return remote name 33 | */ 34 | private[iterator] def getRepositoryRemote(repo: Repository, id: String): Option[String] = { 35 | // TODO: maybe a cache here could improve performance 36 | val c: StoredConfig = repo.getConfig 37 | c.getSubsections("remote").asScala.find(remoteName => { 38 | val actualId: String = 39 | GitUrlsParser.getIdFromUrls(c.getStringList("remote", remoteName, "url")) 40 | 41 | actualId == id 42 | }) 43 | } 44 | 45 | /** 46 | * Parses a reference name and returns a tuple with the repository id and the reference name. 
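 * For refs coming from siva (rooted) repositories the last path segment is the UUID of the
 * remote the ref belongs to, so e.g. "refs/heads/master/<remote-uuid>" is parsed into the
 * repository id derived from that remote's urls and "refs/heads/master".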
47 | * 48 | * @param repo repository 49 | * @param ref reference name 50 | * @return tuple with repository id and reference name 51 | */ 52 | private[iterator] def parseRef(repo: Repository, ref: String): (String, String) = { 53 | val split: Array[String] = ref.split("/") 54 | val uuid: String = split.last 55 | 56 | // if it's a siva file, the last part will be the uuid of the repository, which 57 | // is the name of the remote associated to that particular repository 58 | getRepositoryId(repo, uuid) match { 59 | case Some(repoId) => 60 | val refName: String = split.init.mkString("/") 61 | 62 | (repoId, refName) 63 | 64 | // If no uuid matches, it means this is not a siva file, so we should find this 65 | // using the whole reference name 66 | case None => 67 | val c: StoredConfig = repo.getConfig 68 | val refRemote = repo.getRemoteName(ref) 69 | val repoId = c.getSubsections("remote").asScala 70 | .find(_ == refRemote) 71 | .map(r => GitUrlsParser.getIdFromUrls(c.getStringList("remote", r, "url"))) 72 | .orNull 73 | 74 | if (repoId == null) { 75 | // if branch is local, use the repo path as directory 76 | // since there's no way to tell to which remote it belongs (probably none) 77 | val repoPath = if (repo.getDirectory.toPath.getFileName.toString == ".git") { 78 | // non-bare repositories will have the .git directory as their directory 79 | // so we'll use the parent 80 | repo.getDirectory.toPath.getParent 81 | } else { 82 | repo.getDirectory.toPath 83 | } 84 | 85 | ("file://" + repoPath, ref) 86 | } else { 87 | (repoId, ref.replace(s"refs/remotes/$refRemote", "refs/heads")) 88 | } 89 | } 90 | } 91 | 92 | private[iterator] def isRemote(repo: Repository, ref: String): Boolean = { 93 | val split: Array[String] = ref.split("/") 94 | val uuid: String = split.last 95 | 96 | // if it's a siva file, the last part will be the uuid of the repository, which 97 | // is the name of the remote associated to that particular repository 98 | getRepositoryId(repo, uuid) match { 99 | case Some(_) => 100 | true // is a siva file 101 | 102 | // If no uuid matches, it means this is not a siva file, so we should find this 103 | // using the whole reference name 104 | case None => 105 | Option(repo.getRemoteName(ref)).isDefined 106 | } 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/provider/ReadOnlyFileRepository.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.io.File 4 | 5 | import org.eclipse.jgit.internal.storage.file.FileRepository 6 | import org.eclipse.jgit.storage.file.FileBasedConfig 7 | 8 | /** 9 | * [[FileRepository]] implementation for read-only repositories. 10 | * 11 | * Some operations are performance optimized for this case. If the underlying repository changes, 12 | * usage of this repository implementation might lead to unexpected results. 13 | * 14 | * @param gitDir Path to the git directory. 15 | */ 16 | private[provider] class ReadOnlyFileRepository(gitDir: File) extends FileRepository(gitDir) { 17 | 18 | /** @inheritdoc */ 19 | override lazy val getConfig: FileBasedConfig = { 20 | //XXX: repoConfig is initialized in FileRepository's constructor. 21 | // Here we always return it without checking for changes in the underlying 22 | // filesystem. This prevents checking for last modification date of configuration 23 | // files on every operation. 
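// The field is private in FileRepository, hence the reflective access below; caching it is
// only safe because this repository is treated as read-only.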
24 | val accessor = classOf[FileRepository].getDeclaredField("repoConfig") 25 | accessor.setAccessible(true) 26 | accessor.get(this).asInstanceOf[FileBasedConfig] 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/provider/RepositoryRDDProvider.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.util.concurrent.ConcurrentHashMap 4 | 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.input.PortableDataStream 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.collection.concurrent 11 | import scala.collection.convert.decorateAsScala._ 12 | 13 | /** 14 | * Provides an RDD of repositories in the following forms: 15 | * - siva files 16 | * - bare repositories 17 | * - regular git repositories 18 | * 19 | * @param sc Spark Context 20 | */ 21 | class RepositoryRDDProvider(sc: SparkContext) { 22 | private val rdd: concurrent.Map[String, RDD[RepositorySource]] = 23 | new ConcurrentHashMap[String, RDD[RepositorySource]]().asScala 24 | 25 | /** 26 | * Generates an RDD of repositories with their source at the given path. 27 | * Path may be remote or local. 28 | * 29 | * @param path Path where the repositories are stored. 30 | * @param repositoriesFormat Format of the repositories that are inside the provided path 31 | * @return RDD of repositories 32 | */ 33 | def get(path: String, repositoriesFormat: String): RDD[RepositorySource] = 34 | rdd.getOrElse(path, RepositoryRDDProvider.generateRDD(sc, path, repositoriesFormat)) 35 | } 36 | 37 | /** 38 | * Provides some utility methods for [[RepositoryRDDProvider]] class. 39 | * Acts as a singleton for getting an unique instance of [[RepositoryRDDProvider]]s, so the 40 | * recommended way of using said class is using this companion object. 41 | */ 42 | object RepositoryRDDProvider { 43 | val SivaFormat: String = "siva" 44 | val BareFormat: String = "bare" 45 | val StandardFormat: String = "standard" 46 | 47 | /** The singleton Siva RDD provider. */ 48 | var provider: RepositoryRDDProvider = _ 49 | 50 | /** 51 | * Returns the provider instance and creates one if none has been created yet. 52 | * 53 | * @param sc Spark Context 54 | * @return RepositorySource RDD provider 55 | */ 56 | def apply(sc: SparkContext): RepositoryRDDProvider = { 57 | Option(provider).getOrElse({ 58 | provider = new RepositoryRDDProvider(sc) 59 | provider 60 | }) 61 | } 62 | 63 | /** 64 | * Generates an RDD of [[RepositorySource]] with the repositories at the given path. 65 | * Allows bucketing of siva files and raw repositories. 
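 * Supported formats are "siva", "bare" and "standard". An illustrative call through the
 * public API, given a SparkContext sc:
 * {{{
 *   RepositoryRDDProvider(sc).get("/path/to/siva-files", RepositoryRDDProvider.SivaFormat)
 * }}}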
66 | * 67 | * @param sc Spark Context 68 | * @param path path to get the repositories from 69 | * @param repositoriesFormat format of the repositories inside the provided path 70 | * @return generated RDD 71 | */ 72 | private def generateRDD(sc: SparkContext, 73 | path: String, 74 | repositoriesFormat: String): RDD[RepositorySource] = { 75 | repositoriesFormat match { 76 | case SivaFormat => 77 | sc.binaryFiles(s"$path/*").flatMap(b => if (b._1.endsWith(".siva")) { 78 | Some(SivaRepository(b._2)) 79 | } else { 80 | None 81 | }) 82 | case StandardFormat | BareFormat => 83 | sc.binaryFiles(s"$path/**/*").map { 84 | case (path: String, pds: PortableDataStream) => 85 | // returns a tuple of the root directory where it is contained, with a maximum depth 86 | // of 1 under the given path, the file name, and the portable data stream 87 | val idx = path.indexOf('/', path.length + 1) 88 | if (idx < 0) { 89 | val p = new Path(path) 90 | (p.getParent.toString, (p.getName, pds)) 91 | } else { 92 | val (parent, file) = path.splitAt(idx) 93 | (parent, (file, pds)) 94 | } 95 | }.groupByKey() 96 | .map { 97 | case (dir, files) => 98 | if (repositoriesFormat == StandardFormat) { 99 | GitRepository(dir, files.head._2) 100 | } else { 101 | BareRepository(dir, files.head._2) 102 | } 103 | } 104 | case other => throw new RuntimeException(s"Repository format $other is not supported") 105 | } 106 | } 107 | 108 | } 109 | 110 | /** 111 | * RepositorySource is a repository that comes from a certain source. 112 | */ 113 | sealed trait RepositorySource extends Serializable { 114 | /** 115 | * Returns the portable data stream of one of the repository files. In the case 116 | * of siva files, of the siva file itself. 117 | * 118 | * @return portable data stream 119 | */ 120 | def pds: PortableDataStream 121 | 122 | /** 123 | * Returns the path to the root of the repository. In the case of siva files, the 124 | * path to the siva file itself. 125 | * 126 | * @return path to the repository root 127 | */ 128 | def root: String 129 | } 130 | 131 | /** 132 | * Repository coming from a siva file. 133 | * 134 | * @param pds portable data stream of the siva file 135 | */ 136 | case class SivaRepository(pds: PortableDataStream) extends RepositorySource { 137 | def root: String = pds.getPath 138 | } 139 | 140 | /** 141 | * Repository coming from a bare repository. 142 | * 143 | * @param root root of the repository 144 | * @param pds portable data stream of any repository file (should only be used to 145 | * retrieve the HDFS config) 146 | */ 147 | case class BareRepository(root: String, pds: PortableDataStream) extends RepositorySource 148 | 149 | /** 150 | * Repository coming from a regular repository with a .git directory. 
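 * (a non-bare checkout, in contrast to BareRepository above)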
151 | * 152 | * @param root root of the repository 153 | * @param pds portable data stream of any repository file (should only be used to 154 | * retrieve the HDFS config) 155 | */ 156 | case class GitRepository(root: String, pds: PortableDataStream) extends RepositorySource 157 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/rule/AddSourceToAttributes.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.rule 2 | 3 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 4 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 5 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 6 | import org.apache.spark.sql.catalyst.rules.Rule 7 | import org.apache.spark.sql.execution.datasources.LogicalRelation 8 | import org.apache.spark.sql.sources.BaseRelation 9 | import org.apache.spark.sql.types.MetadataBuilder 10 | import tech.sourced.engine.{GitRelation, MetadataRelation, Sources} 11 | import tech.sourced.engine.compat 12 | 13 | /** 14 | * Rule to assign to an [[AttributeReference]] metadata to identify the table it belongs to. 15 | */ 16 | object AddSourceToAttributes extends Rule[LogicalPlan] { 17 | 18 | /** 19 | * SOURCE is the key used for attach metadata to [[AttributeReference]]s. 20 | */ 21 | private val SOURCE = Sources.SourceKey 22 | 23 | /** @inheritdoc */ 24 | def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { 25 | case compat.LogicalRelation(rel @ GitRelation(_, _, _, schemaSource), 26 | out, 27 | catalogTable) => 28 | withMetadata(rel, schemaSource, out, catalogTable) 29 | 30 | case compat.LogicalRelation( 31 | rel @ MetadataRelation(_, _, _, _, schemaSource), 32 | out, 33 | catalogTable) => 34 | withMetadata(rel, schemaSource, out, catalogTable) 35 | } 36 | 37 | private def withMetadata(relation: BaseRelation, 38 | schemaSource: Option[String], 39 | out: Seq[AttributeReference], 40 | catalogTable: Option[CatalogTable]): LogicalRelation = { 41 | val processedOut = schemaSource match { 42 | case Some(table) => out.map( 43 | _.withMetadata(new MetadataBuilder().putString(SOURCE, table).build() 44 | ).asInstanceOf[AttributeReference] 45 | ) 46 | case None => out 47 | } 48 | 49 | compat.LogicalRelation(relation, processedOut, catalogTable) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/rule/RelationOptimizer.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.rule 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.catalyst.expressions._ 5 | import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} 6 | import org.apache.spark.sql.catalyst.plans.logical.Join 7 | import org.apache.spark.sql.execution.datasources.LogicalRelation 8 | import org.apache.spark.sql.types.{StructField, StructType} 9 | import tech.sourced.engine.Sources 10 | 11 | private[rule] object RelationOptimizer extends Logging { 12 | private val supportedJoinTypes: Seq[JoinType] = Inner :: Nil 13 | 14 | /** 15 | * Reports whether the given join is supported. 16 | * 17 | * @param j join 18 | * @return is supported or not 19 | */ 20 | def isJoinSupported(j: Join): Boolean = supportedJoinTypes.contains(j.joinType) 21 | 22 | /** 23 | * Retrieves all the unsupported conditions in the join. 
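 * That is, the references used by the join condition that are not provided by either the
 * left or the right relation.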
24 | * 25 | * @param join Join 26 | * @param left left relation 27 | * @param right right relation 28 | * @return unsupported conditions 29 | */ 30 | def getUnsupportedConditions(join: Join, 31 | left: LogicalRelation, 32 | right: LogicalRelation): Set[_] = { 33 | val leftReferences = left.references.baseSet 34 | val rightReferences = right.references.baseSet 35 | val joinReferences = join.references.baseSet 36 | joinReferences -- leftReferences -- rightReferences 37 | } 38 | 39 | /** 40 | * Mixes the two given expressions with the given join function if both exist 41 | * or returns the one that exists otherwise. 42 | * 43 | * @param l left expression 44 | * @param r right expression 45 | * @param joinFunction function used to join them 46 | * @return an optional expression 47 | */ 48 | def mixExpressions(l: Option[Expression], 49 | r: Option[Expression], 50 | joinFunction: (Expression, Expression) => Expression): 51 | Option[Expression] = { 52 | (l, r) match { 53 | case (Some(expr1), Some(expr2)) => Some(joinFunction(expr1, expr2)) 54 | case (None, None) => None 55 | case (le, None) => le 56 | case (None, re) => re 57 | } 58 | } 59 | 60 | /** 61 | * Creates a schema from a list of attributes. 62 | * 63 | * @param attributes list of attributes 64 | * @return resultant schema 65 | */ 66 | def attributesToSchema(attributes: Seq[AttributeReference]): StructType = 67 | StructType( 68 | attributes 69 | .map((a: Attribute) => StructField(a.name, a.dataType, a.nullable, a.metadata)) 70 | .toArray 71 | ) 72 | 73 | /** 74 | * Takes the join conditions, if any, and transforms them to filters, by removing some filters 75 | * that don't make sense because they are already done inside the iterator. 76 | * 77 | * @param expr optional condition to transform 78 | * @return transformed join conditions or none 79 | */ 80 | def joinConditionsToFilters(expr: Option[Expression]): Option[Expression] = expr match { 81 | case Some(e) => 82 | e transformUp { 83 | case Equality( 84 | a: AttributeReference, 85 | b: AttributeReference 86 | ) if isRedundantAttributeFilter(a, b) => 87 | EqualTo(Literal(1), Literal(1)) 88 | 89 | case BinaryOperator(a, Equality(IntegerLiteral(1), IntegerLiteral(1))) => 90 | a 91 | 92 | case BinaryOperator(Equality(IntegerLiteral(1), IntegerLiteral(1)), b) => 93 | b 94 | } match { 95 | case Equality(IntegerLiteral(1), IntegerLiteral(1)) => 96 | None 97 | case finalExpr => 98 | Some(finalExpr) 99 | } 100 | case None => None 101 | } 102 | 103 | /** 104 | * Returns whether the equality between the two given attribute references is redundant 105 | * for a filter (because they are taken care of inside the iterators). 
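 * For example, an equality between repositories.id and references.repository_id is
 * redundant, since the chained iterators already enforce that relationship.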
106 | * 107 | * @param a left attribute 108 | * @param b right attribute 109 | * @return is redundant or not 110 | */ 111 | def isRedundantAttributeFilter(a: AttributeReference, b: AttributeReference): Boolean = { 112 | // to avoid case (a, b) and case (b, a) we take left and right sorted by name and source 113 | val (left, right) = a.name.compareTo(b.name) match { 114 | case 0 => 115 | val sourceA = attributeSource(a).getOrElse("") 116 | val sourceB = attributeSource(b).getOrElse("") 117 | if (sourceA.compareTo(sourceB) <= 0) (a, b) else (b, a) 118 | case n if n < 0 => (a, b) 119 | case _ => (b, a) 120 | } 121 | 122 | (attributeQualifiedName(left), attributeQualifiedName(right)) match { 123 | case (("repositories", "id"), ("references", "repository_id")) => true 124 | case (("references", "name"), ("commits", "reference_name")) => true 125 | case (("tree_entries", "commit_hash"), ("commits", "hash")) => true 126 | case (("tree_entries", "blob"), ("blobs", "blob_id")) => true 127 | // source does not matter in these cases 128 | case ((_, "repository_id"), (_, "repository_id")) => true 129 | case ((_, "reference_name"), (_, "reference_name")) => true 130 | case ((_, "commit_hash"), (_, "commit_hash")) => true 131 | case _ => false 132 | } 133 | } 134 | 135 | def attributeSource(a: AttributeReference): Option[String] = 136 | if (a.metadata.contains(Sources.SourceKey)) { 137 | Some(a.metadata.getString(Sources.SourceKey)) 138 | } else { 139 | None 140 | } 141 | 142 | def attributeQualifiedName(a: AttributeReference): (String, String) = 143 | (attributeSource(a).getOrElse(""), a.name) 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ClassifyLanguagesUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | import tech.sourced.enry.Enry 8 | 9 | /** User defined function to guess languages of files. */ 10 | case object ClassifyLanguagesUDF extends CustomUDF with Logging { 11 | 12 | override val name = "classifyLanguages" 13 | 14 | override def apply(session: SparkSession): UserDefinedFunction = 15 | udf[Option[String], Boolean, String, Array[Byte]](getLanguage) 16 | 17 | /** 18 | * Gets the language of the given file and returns the guessed language or none. 19 | * 20 | * @param isBinary whether it's a binary file or not 21 | * @param path file path 22 | * @param content file content 23 | * @return `None` if no language could be guessed, `Some(language)` otherwise. 
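 * Illustrative example, assuming enry maps the .py extension to Python:
 * {{{
 *   ClassifyLanguagesUDF.getLanguage(isBinary = false, "hello.py", "print('hi')".getBytes)
 *   // => Some("Python")
 * }}}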
24 | */ 25 | def getLanguage(isBinary: Boolean, path: String, content: Array[Byte]): Option[String] = { 26 | timer.time({ 27 | if (isBinary) { 28 | None 29 | } else { 30 | val lang = try { 31 | Enry.getLanguage(path, content) 32 | } catch { 33 | case e@(_: RuntimeException | _: Exception) => 34 | log.error(s"get language for file '$path' failed", e) 35 | null 36 | } 37 | if (null == lang || lang.isEmpty) None else Some(lang) 38 | } 39 | }) 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ConcatArrayUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.expressions.UserDefinedFunction 5 | import org.apache.spark.sql.functions.udf 6 | 7 | 8 | /** User defined function to concat array elements with the given separator. */ 9 | case object ConcatArrayUDF extends CustomUDF { 10 | 11 | override val name = "concatArray" 12 | 13 | override def apply(session: SparkSession): UserDefinedFunction = { 14 | udf[String, Seq[String], String]((arr, sep) => arr.mkString(sep)) 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/CustomUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.groupon.metrics.{NotInitializedException, SparkTimer, UserMetricsSystem} 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.expressions.UserDefinedFunction 7 | 8 | /** 9 | * Custom named user defined function. 10 | */ 11 | abstract class CustomUDF { 12 | /** timer intended to be used on UDF logic **/ 13 | lazy protected val timer: SparkTimerUDFWrapper = new SparkTimerUDFWrapper(name) 14 | 15 | /** Name of the function. */ 16 | val name: String 17 | 18 | /** Function to execute when this function is called. */ 19 | def apply(session: SparkSession): UserDefinedFunction 20 | 21 | def apply(): UserDefinedFunction = this.apply(session = null) 22 | } 23 | 24 | sealed class SparkTimerUDFWrapper(name: String) extends Logging { 25 | lazy val timer: SparkTimer = init() 26 | 27 | private def init(): SparkTimer = { 28 | try { 29 | UserMetricsSystem.timer(name) 30 | } catch { 31 | case _: NotInitializedException => { 32 | logWarning("SparkMetric not initialized on UDF") 33 | null 34 | } 35 | } 36 | 37 | } 38 | 39 | def time[T](f: => T): T = 40 | if (timer == null) { 41 | f 42 | } else { 43 | timer.time(f) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ExtractTokensUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | 8 | /** User defined function to extract tokens from an UAST. 
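 * Takes a sequence of binary-encoded UAST nodes and returns the sequence of their tokens.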
*/ 9 | case object ExtractTokensUDF extends CustomUDF { 10 | 11 | override val name = "extractTokens" 12 | 13 | override def apply(session: SparkSession): UserDefinedFunction = 14 | udf[Seq[String], Seq[Array[Byte]]](extractTokens) 15 | 16 | private def extractTokens(nodes: Seq[Array[Byte]]): Seq[String] = { 17 | timer.time({ 18 | if (nodes == null) { 19 | Seq() 20 | } else { 21 | nodes.map(Node.parseFrom).map(_.token) 22 | } 23 | }) 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ExtractUASTsUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.expressions.UserDefinedFunction 5 | import org.apache.spark.sql.functions.udf 6 | import tech.sourced.engine.util.Bblfsh 7 | 8 | trait ExtractUASTsUDF { 9 | 10 | def extractUASTs(path: String, 11 | content: Array[Byte], 12 | lang: String = null, 13 | config: Bblfsh.Config): Seq[Array[Byte]] = { 14 | if (content == null || content.isEmpty) { 15 | Seq() 16 | } else { 17 | Bblfsh.extractUAST(path, content, lang, config) 18 | } 19 | } 20 | 21 | } 22 | 23 | /** Common entry point to use extraction UAST UDFs with or without language parameter. */ 24 | case object ExtractUASTsUDF extends CustomUDF with ExtractUASTsUDF { 25 | 26 | override val name = "extractUASTs" 27 | 28 | override def apply(session: SparkSession): UserDefinedFunction = { 29 | val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session)) 30 | udf[Seq[Array[Byte]], String, Array[Byte], String]((path, content, lang) => 31 | extractUASTs(path, content, lang, configB.value)) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/QueryXPathUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | import tech.sourced.engine.util.Bblfsh 8 | 9 | 10 | /** User defined function to perform XPath queries on UASTs. 
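 * Takes a sequence of binary-encoded UAST nodes plus an XPath query and returns the
 * matching nodes, binary-encoded as well.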
*/ 11 | case object QueryXPathUDF extends CustomUDF { 12 | 13 | override val name = "queryXPath" 14 | 15 | override def apply(session: SparkSession): UserDefinedFunction = { 16 | val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session)) 17 | udf[Seq[Array[Byte]], Seq[Array[Byte]], String]((nodes, query) => 18 | queryXPath(nodes, query, configB.value)) 19 | } 20 | 21 | private def queryXPath(nodes: Seq[Array[Byte]], 22 | query: String, 23 | config: Bblfsh.Config): Seq[Array[Byte]] = { 24 | timer.time({ 25 | if (nodes == null) { 26 | return null 27 | } 28 | 29 | nodes.map(Node.parseFrom).flatMap(n => { 30 | val result = Bblfsh.filter(n, query, config) 31 | if (result == null) { 32 | None 33 | } else { 34 | result.toIterator 35 | } 36 | }).map(_.toByteArray) 37 | }) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/Bblfsh.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import gopkg.in.bblfsh.sdk.v1.protocol.generated.Status 6 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 7 | import org.apache.spark.internal.Logging 8 | import org.apache.spark.sql.SparkSession 9 | import org.bblfsh.client.BblfshClient 10 | import tech.sourced.engine.util.Bblfsh.client 11 | 12 | object Bblfsh extends Logging { 13 | 14 | case class Config(host: String, port: Int) 15 | 16 | /** Key used for the option to specify the host of the bblfsh grpc service. */ 17 | val hostKey = "spark.tech.sourced.bblfsh.grpc.host" 18 | 19 | /** Key used for the option to specify the port of the bblfsh grpc service. */ 20 | val portKey = "spark.tech.sourced.bblfsh.grpc.port" 21 | 22 | /** Default bblfsh host. */ 23 | val defaultHost = "0.0.0.0" 24 | 25 | /** Default bblfsh port. */ 26 | val defaultPort = 9432 27 | 28 | var supportedLanguages: Set[String] = Set() 29 | 30 | private var config: Config = _ 31 | private var client: BblfshClient = _ 32 | 33 | /** 34 | * Returns the configuration for bblfsh. 35 | * 36 | * @param session Spark session 37 | * @return bblfsh configuration 38 | */ 39 | def getConfig(session: SparkSession): Config = { 40 | if (config == null) { 41 | val host = session.conf.get(hostKey, Bblfsh.defaultHost) 42 | val port = session.conf.get(portKey, Bblfsh.defaultPort.toString).toInt 43 | config = Config(host, port) 44 | } 45 | 46 | config 47 | } 48 | 49 | private def getClient(config: Config): BblfshClient = synchronized { 50 | if (client == null) { 51 | client = BblfshClient(config.host, config.port) 52 | } 53 | 54 | client 55 | } 56 | 57 | private def getSupportedLanguages(config: Config): Set[String] = synchronized { 58 | if (supportedLanguages.isEmpty) { 59 | val client = getClient(config) 60 | supportedLanguages = client.supportedLanguages() 61 | .languages.map(m => m.language) 62 | .toSet 63 | } 64 | 65 | supportedLanguages 66 | } 67 | 68 | private def shouldExtractLanguage(config: Config, lang: String): Boolean = { 69 | val supportedLanguages = getSupportedLanguages(config) 70 | supportedLanguages.contains(lang.toLowerCase()) 71 | } 72 | 73 | /** 74 | * Extracts the UAST using bblfsh. 
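 * If lang is null the bblfsh server guesses the language; if a language is provided but
 * not supported by the server, an empty result is returned without calling parse.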
75 | * 76 | * @param path File path 77 | * @param content File content 78 | * @param lang File language 79 | * @param config bblfsh configuration 80 | * @return List of uast nodes binary-encoded as a byte array 81 | */ 82 | def extractUAST(path: String, 83 | content: Array[Byte], 84 | lang: String, 85 | config: Config): Seq[Array[Byte]] = { 86 | //FIXME(bzz): not everything is UTF-8 encoded :/ 87 | // if lang == null, it hasn't been classified yet 88 | // so rely on bblfsh to guess this file's language 89 | if (lang != null && !shouldExtractLanguage(config, lang)) { 90 | Seq() 91 | } else { 92 | val client = getClient(config) 93 | val contentStr = new String(content, StandardCharsets.UTF_8) 94 | val parsed = client.parse(path, content = contentStr, lang = lang) 95 | if (parsed.status == Status.OK) { 96 | Seq(parsed.uast.get.toByteArray) 97 | } else { 98 | logWarning(s"${parsed.status} $path: ${parsed.errors.mkString("; ")}") 99 | Seq() 100 | } 101 | } 102 | } 103 | 104 | /** 105 | * Filter an UAST node using the given query. 106 | * 107 | * @param node An UAST node 108 | * @param query XPath expression 109 | * @param config bblfsh configuration 110 | * @return UAST list of filtered nodes 111 | */ 112 | def filter(node: Node, query: String, config: Config): List[Node] = { 113 | getClient(config).filter(node, query) 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/GitUrlsParser.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.net.{URI, URISyntaxException} 4 | 5 | object GitUrlsParser { 6 | private val isGit = """(.+)\@(.+):(.+)\.git""".r 7 | 8 | /** 9 | * Retrieves the URL that will act as identifier in a list of URLs 10 | * for a repository. 11 | * 12 | * @param urls array of urls 13 | * @return processed id 14 | */ 15 | def getIdFromUrls(urls: Array[String]): String = { 16 | urls.flatMap({ 17 | case isGit(_, host, path, _*) => 18 | Some(s"$host/$path") 19 | case s => try { 20 | val u: URI = new URI(s) 21 | Some(u.getHost + u.getPath) 22 | } catch { 23 | case _: URISyntaxException => None 24 | } 25 | }).distinct.min 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/MD5Gen.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.security.MessageDigest 4 | import javax.xml.bind.annotation.adapters.HexBinaryAdapter 5 | 6 | /** 7 | * Convenience wrapper around java [[java.security.MessageDigest]] for easier md5 hashing. 8 | */ 9 | object MD5Gen { 10 | private val ba = new HexBinaryAdapter() 11 | 12 | /** 13 | * Hashes the given string using md5. 
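 * The digest is returned as a 32-character hexadecimal string.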
14 | * 15 | * @param s string to hash 16 | * @return hashed string 17 | */ 18 | def str(s: String): String = synchronized { 19 | ba.marshal(MessageDigest.getInstance("MD5").digest(s.getBytes())) 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /src/test/resources/bad-siva-files/0a0bfaa46954437548fbaeb0e19237f84e968511.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/bad-siva-files/0a0bfaa46954437548fbaeb0e19237f84e968511.siva -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Define the root logger with appender 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=ERROR 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.tech.sourced.engine.iterator.BlobIterator=ERROR 12 | log4j.logger.tech.sourced.engine.provider.RepositoryProvider=ERROR 13 | -------------------------------------------------------------------------------- /src/test/resources/siva-files/05893125684f2d3943cd84a7ab2b75e53668fba1.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/05893125684f2d3943cd84a7ab2b75e53668fba1.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/ff/fff840f8784ef162dc83a1465fc5763d890b68ba.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/ff/fff840f8784ef162dc83a1465fc5763d890b68ba.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/fff7062de8474d10a67d417ccea87ba6f58ca81d.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/fff7062de8474d10a67d417ccea87ba6f58ca81d.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/not-siva.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/not-siva.txt -------------------------------------------------------------------------------- /src/test/resources/zip-slip-siva-files/git-zipslip.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/zip-slip-siva-files/git-zipslip.siva 
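The fixture sets above map onto the specs that follow: /siva-files backs the main suites, /bad-siva-files exercises skipReadErrors, and /zip-slip-siva-files triggers the zip-slip guard. A minimal, hypothetical sketch of the skip-read-errors path, built only from calls that appear in EngineSpec below and assuming the fixtures are available on the classpath:

```scala
import org.apache.spark.sql.SparkSession
import tech.sourced.engine._

object BadSivaExample extends App {
  val spark = SparkSession.builder()
    .appName("bad-siva-fixtures")
    .master("local[*]")
    .getOrCreate()

  // skipReadErrors(true) makes the engine skip corrupt siva archives instead of
  // failing the whole job, which is what EngineSpec asserts against /bad-siva-files.
  val engine = Engine(spark, getClass.getResource("/bad-siva-files").toString, "siva")
    .skipReadErrors(true)

  println(engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.count())

  spark.stop()
}
```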
-------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSivaSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | trait BaseSivaSpec { 4 | val resourcePath: String = getClass.getResource("/siva-files").toString 5 | } 6 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.SparkException 4 | import org.scalatest._ 5 | 6 | class BaseSourceSpec(source: String = "BaseSourceSpec") 7 | extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec with BeforeAndAfterEach { 8 | 9 | var engine: Engine = _ 10 | 11 | override protected def beforeEach(): Unit = { 12 | super.beforeAll() 13 | 14 | engine = Engine(ss, resourcePath, "siva") 15 | } 16 | 17 | source should "get heads of all repositories and count the files" in { 18 | val df = engine.getRepositories 19 | .getHEAD 20 | .getCommits 21 | .getTreeEntries 22 | .getBlobs 23 | .select("commit_hash", "path", "content", "is_binary") 24 | .distinct() 25 | df.count should be(457) 26 | } 27 | 28 | it should "count all the commit messages from all masters that are not forks" in { 29 | val commits = engine.getRepositories.filter("is_fork = false").getMaster.getAllReferenceCommits 30 | val df = commits.select("message").filter(commits("message").startsWith("a")) 31 | df.count should be(7) 32 | } 33 | 34 | it should "count all commits messages from all references that are not forks" in { 35 | val commits = engine.getRepositories.filter("is_fork = false").getReferences 36 | .getAllReferenceCommits 37 | val df = commits.select("message", "reference_name", "hash"). 
38 | filter(commits("message").startsWith("a")) 39 | df.count should be(98) 40 | } 41 | 42 | it should "get all files from HEADS that are Ruby" in { 43 | val blobs = engine.getRepositories.filter("is_fork = false") 44 | .getHEAD 45 | .getCommits 46 | .getTreeEntries 47 | .getBlobs 48 | .classifyLanguages 49 | val df = blobs.filter(blobs("lang") === "Ruby").select("lang", "path") 50 | df.count should be(169) 51 | } 52 | 53 | it should "get all tree entries" in { 54 | val df = engine.getRepositories.getReferences.getAllReferenceCommits.getTreeEntries 55 | df.count() should be(304362) 56 | } 57 | 58 | it should "filter by reference from repos dataframe" in { 59 | val spark = ss 60 | 61 | val df = Engine(spark, resourcePath, "siva") 62 | .getRepositories 63 | .getReference("refs/heads/develop") 64 | assert(df.count == 2) 65 | } 66 | 67 | "Filter by HEAD reference" should "return only HEAD references" in { 68 | val spark = ss 69 | val df = Engine(spark, resourcePath, "siva").getRepositories.getHEAD 70 | assert(df.count == 5) 71 | } 72 | 73 | "Filter by master reference" should "return only master references" in { 74 | val df = engine.getRepositories.getMaster 75 | assert(df.count == 5) 76 | } 77 | 78 | "Get develop commits" should "return only develop commits" in { 79 | val df = engine.getRepositories 80 | .getReference("refs/heads/develop").getAllReferenceCommits 81 | .select("hash", "repository_id") 82 | assert(df.count == 103) 83 | } 84 | 85 | "Get files after reading commits" should "return the correct files" in { 86 | val files = engine.getRepositories 87 | .getReferences 88 | .getAllReferenceCommits 89 | .getBlobs 90 | .drop("repository_id", "reference_name") 91 | .distinct() 92 | 93 | assert(files.count == 91944) 94 | } 95 | 96 | "Get files without reading tree entries" should "return the correct files" in { 97 | val files = engine.getRepositories 98 | .getReferences 99 | .getAllReferenceCommits 100 | .getBlobs 101 | .drop("repository_id", "reference_name") 102 | .distinct() 103 | 104 | assert(files.count == 91944) 105 | } 106 | 107 | "Get files" should "return the correct files" in { 108 | val df = engine.getRepositories.getHEAD.getAllReferenceCommits 109 | .sort("hash").limit(10) 110 | val rows = df.collect() 111 | .map(row => (row.getString(row.fieldIndex("repository_id")), 112 | row.getString(row.fieldIndex("hash")))) 113 | val repositories = rows.map(_._1) 114 | val hashes = rows.map(_._2) 115 | 116 | val files = engine 117 | .getBlobs(repositories.distinct, List("refs/heads/HEAD"), hashes.distinct) 118 | .drop("repository_id", "reference_name") 119 | .distinct() 120 | 121 | assert(files.count == 655) 122 | } 123 | 124 | it should "return the correct files if we filter by repository" in { 125 | val files = engine 126 | .getBlobs(repositoryIds = List("github.com/xiyou-linuxer/faq-xiyoulinux")) 127 | .drop("repository_id", "reference_name") 128 | .distinct() 129 | 130 | assert(files.count == 2421) 131 | } 132 | 133 | it should "return the correct files if we filter by reference" in { 134 | val files = engine 135 | .getBlobs(referenceNames = List("refs/heads/develop")) 136 | .drop("repository_id", "reference_name") 137 | .distinct() 138 | 139 | assert(files.count == 425) 140 | } 141 | 142 | it should "return the correct files if we filter by commit" in { 143 | val files = engine 144 | .getBlobs(commitHashes = List("fff7062de8474d10a67d417ccea87ba6f58ca81d")) 145 | .drop("repository_id", "reference_name") 146 | .distinct() 147 | assert(files.count == 2) 148 | } 149 | 150 | override 
protected def afterEach(): Unit = { 151 | super.afterEach() 152 | 153 | engine = _: Engine 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSparkSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | trait BaseSparkSpec extends BeforeAndAfterAll { 7 | this: Suite => 8 | 9 | var ss: SparkSession = _ 10 | 11 | override protected def beforeAll(): Unit = { 12 | super.beforeAll() 13 | ss = SparkSession.builder() 14 | .appName("test").master("local[*]") 15 | .config("spark.driver.host", "localhost") 16 | .getOrCreate() 17 | ss.registerUDFs() 18 | } 19 | 20 | override protected def afterAll(): Unit = { 21 | super.afterAll() 22 | ss = null 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/DefaultSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.eclipse.jgit.api.Git 8 | 9 | class DefaultSourceSpec extends BaseSourceSpec("DefaultSource") { 10 | 11 | var tmpPath: Path = Paths.get(System.getProperty("java.io.tmpdir"), UUID.randomUUID.toString) 12 | 13 | override protected def beforeAll(): Unit = { 14 | super.beforeAll() 15 | 16 | tmpPath.toFile.mkdir() 17 | } 18 | 19 | "DefaultSource" should "not optimize if the conditions on the " + 20 | "join are not the expected ones" in { 21 | val repos = engine.getRepositories 22 | val references = ss.read.format("tech.sourced.engine").option("table", "references").load() 23 | val out = repos.join(references, 24 | (references("repository_id") === repos("id")) 25 | .and(references("name").startsWith("refs/pull")) 26 | ).count() 27 | 28 | val df = references.limit(1).getCommits 29 | df.count() should be(1) 30 | } 31 | 32 | it should "return the remote branches renamed to refs/heads" in { 33 | val repoDir = tmpPath.resolve("repo") 34 | 35 | Git.cloneRepository() 36 | .setURI("https://github.com/src-d/jgit-spark-connector.git") 37 | .setDirectory(repoDir.toFile) 38 | .call() 39 | 40 | val engine = Engine(ss, tmpPath.toString, "standard") 41 | val masters = engine.getRepositories 42 | .getMaster 43 | .collect() 44 | .sortBy(_.getAs[String]("repository_id")) 45 | 46 | masters.length should be(2) 47 | masters(0).getAs[String]("repository_id") should startWith("file") 48 | masters(0).getAs[Boolean]("is_remote") should be(false) 49 | 50 | masters(1).getAs[String]("repository_id") should startWith("github") 51 | masters(1).getAs[Boolean]("is_remote") should be(true) 52 | 53 | engine.getRepositories.getRemoteReferences.getMaster.count() should be(1) 54 | } 55 | 56 | it should "match HEAD and not just refs/heads/HEAD" in { 57 | val repoDir = tmpPath.resolve("repo") 58 | 59 | import tech.sourced.engine.util.RepoUtils._ 60 | 61 | val repo = createRepo(repoDir) 62 | commitFile(repo, "foo", "bar", "baz") 63 | 64 | Engine(ss, tmpPath.toString, "standard").getRepositories.getHEAD.count() should be(1) 65 | } 66 | 67 | it should "traverse all commits if it's not chained" in { 68 | val row = engine.session.sql("SELECT COUNT(*) FROM commits").first() 69 | row(0) should be(4444) 70 | 71 | val row2 = 
engine.session.sql("SELECT COUNT(*) FROM commits WHERE index > 0").first() 72 | row2(0) should be(4390) 73 | } 74 | 75 | override protected def afterAll(): Unit = { 76 | super.afterAll() 77 | 78 | FileUtils.deleteQuietly(tmpPath.toFile) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/EngineSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.{Properties, UUID} 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.apache.spark.SparkException 8 | import org.scalatest.{FlatSpec, Matchers} 9 | 10 | class EngineSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 11 | 12 | var engine: Engine = _ 13 | var tmpPath: Path = _ 14 | 15 | override protected def beforeAll(): Unit = { 16 | super.beforeAll() 17 | 18 | engine = Engine(ss, resourcePath, "siva") 19 | tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 20 | .resolve(UUID.randomUUID.toString) 21 | tmpPath.toFile.mkdir() 22 | } 23 | 24 | override def afterAll(): Unit = { 25 | super.afterAll() 26 | FileUtils.deleteQuietly(tmpPath.toFile) 27 | } 28 | 29 | "saveMetadata" should "store all metadata tables in a SQLite db" in { 30 | engine.saveMetadata(tmpPath.toString) 31 | 32 | val dbFile = tmpPath.resolve("engine_metadata.db") 33 | dbFile.toFile.exists should be(true) 34 | 35 | val properties = new Properties() 36 | properties.put("driver", "org.sqlite.JDBC") 37 | 38 | val reposDf = engine.getRepositories 39 | val refsDf = reposDf.getReferences 40 | val repoHasCommitsDf = refsDf.getAllReferenceCommits 41 | .select("reference_name", "repository_id", "hash", "index") 42 | val commitsDf = refsDf.getAllReferenceCommits 43 | .drop("index", "reference_name", "repository_id") 44 | .distinct() 45 | val treeEntriesDf = refsDf.getAllReferenceCommits.getTreeEntries 46 | .drop("reference_name", "repository_id") 47 | .distinct() 48 | 49 | Seq( 50 | (RepositoriesTable, reposDf), 51 | (ReferencesTable, refsDf), 52 | (RepositoryHasCommitsTable, repoHasCommitsDf), 53 | (CommitsTable, commitsDf), 54 | (TreeEntriesTable, treeEntriesDf) 55 | ).foreach { 56 | case (table, df) => 57 | val count = df.count() 58 | ss.read.jdbc(s"jdbc:sqlite:$dbFile", Tables.prefix(table), properties) 59 | .count() should be(count) 60 | } 61 | } 62 | 63 | "skipReadErrors" should "skip all read errors" in { 64 | val resourcePath = getClass.getResource("/bad-siva-files").toString 65 | val engine = Engine(ss, resourcePath, "siva").skipReadErrors(true) 66 | val tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 67 | .resolve(UUID.randomUUID.toString) 68 | tmpPath.toFile.mkdir() 69 | 70 | val cnt = engine 71 | .getRepositories 72 | .getReferences 73 | .getCommits 74 | .getTreeEntries 75 | .getBlobs 76 | .count() 77 | 78 | cnt should be(8663) 79 | 80 | FileUtils.deleteQuietly(tmpPath.toFile) 81 | } 82 | 83 | "engine" should "throw an error when a siva file contains a zip-slip vulnerability" in { 84 | val resourcePath = getClass.getResource("/zip-slip-siva-files").toString 85 | val engine = Engine(ss, resourcePath, "siva") 86 | val tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 87 | .resolve(UUID.randomUUID.toString) 88 | tmpPath.toFile.mkdir() 89 | 90 | val ex = intercept[SparkException] { 91 | engine 92 | .getRepositories 93 | .getReferences 94 | .getCommits 95 | .getTreeEntries 96 | .getBlobs 97 | .count() 98 | } 99 | 
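    // The archive under /zip-slip-siva-files contains an entry whose path climbs out of the
    // extraction directory (zip-slip), so unpacking aborts and the cause message asserted
    // below is surfaced instead of any file being written outside the target dir.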
100 | ex.getCause.getMessage should be("Entry is outside of the target dir: objects/../../imoutside") 101 | 102 | FileUtils.deleteQuietly(tmpPath.toFile) 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/FilterUDFSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | 5 | class FilterUDFSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 6 | 7 | var engine: Engine = _ 8 | 9 | override protected def beforeAll(): Unit = { 10 | super.beforeAll() 11 | engine = Engine(ss, resourcePath, "siva") 12 | } 13 | 14 | "Filter by language" should "work properly" in { 15 | val langDf = engine 16 | .getRepositories 17 | .getReferences 18 | .getCommits 19 | .getBlobs 20 | .classifyLanguages 21 | 22 | val filteredLang = langDf.select("repository_id", "path", "lang").where("lang='Python'") 23 | filteredLang.count() should be(6) 24 | } 25 | 26 | override protected def afterAll(): Unit = { 27 | super.afterAll() 28 | engine = _: Engine 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/MetadataSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | 8 | class MetadataSourceSpec extends BaseSourceSpec("MetadataSource") { 9 | 10 | private var tmpDir: Path = Paths.get( 11 | System.getProperty("java.io.tmpdir"), 12 | UUID.randomUUID().toString 13 | ) 14 | 15 | override protected def beforeAll(): Unit = { 16 | super.beforeAll() 17 | 18 | tmpDir.toFile.mkdir() 19 | 20 | engine = Engine(ss, resourcePath, "siva") 21 | engine.saveMetadata(tmpDir.toString) 22 | engine = engine.fromMetadata(tmpDir.toString) 23 | } 24 | 25 | override protected def afterAll(): Unit = { 26 | super.afterAll() 27 | FileUtils.deleteQuietly(tmpDir.toFile) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/QueryBuilderSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.sql.{Date, Timestamp} 4 | 5 | import org.apache.spark.sql.catalyst.expressions._ 6 | import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType} 7 | import org.apache.spark.unsafe.types.UTF8String 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import QueryBuilder._ 10 | 11 | class QueryBuilderSpec extends FlatSpec with Matchers { 12 | 13 | "QueryBuilder.qualify" should "qualify and quote col" in { 14 | val expected = s"${prefixTable("foo")}.`bar`" 15 | qualify("foo", "bar") should be(expected) 16 | qualify(attr("foo", "bar")) should be(expected) 17 | } 18 | 19 | "QueryBuilder.compileValue" should "return compiled value" in { 20 | val now = System.currentTimeMillis 21 | val cases = Seq( 22 | (UTF8String.fromString("foo"), "'foo'"), 23 | ("fo'o", "'fo''o'"), 24 | (new Timestamp(now), s"'${new Timestamp(now)}'"), 25 | (new Date(now), s"'${new Date(now)}'"), 26 | (Seq("a", 1, true), "'a', 1, 1"), 27 | (true, 1), 28 | (false, 0) 29 | ) 30 | 31 | cases.foreach { 32 | case (input, expected) => 33 | compileValue(input) should be(expected) 34 | } 35 | } 36 | 37 | 
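  // As exercised above, compileValue quotes strings (doubling embedded single quotes),
  // renders Timestamp/Date values as quoted literals, joins sequences with commas, and
  // maps booleans to 1/0; the compileFilter cases below embed literals in the same form.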
"QueryBuilder.compileFilter" should "compile the filters to SQL" in { 38 | val col = qualify("foo", "bar") 39 | val cases = Seq( 40 | (EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 41 | s"$col = 1"), 42 | (EqualNullSafe(attr("foo", "bar"), Literal(1, IntegerType)), 43 | s"(NOT ($col != 1 OR $col IS NULL OR 1 IS NULL) OR ($col IS NULL AND 1 IS NULL))"), 44 | (LessThan(attr("foo", "bar"), Literal(1, IntegerType)), 45 | s"$col < 1"), 46 | (GreaterThan(attr("foo", "bar"), Literal(1, IntegerType)), 47 | s"$col > 1"), 48 | (LessThanOrEqual(attr("foo", "bar"), Literal(1, IntegerType)), 49 | s"$col <= 1"), 50 | (GreaterThanOrEqual(attr("foo", "bar"), Literal(1, IntegerType)), 51 | s"$col >= 1"), 52 | (IsNull(attr("foo", "bar")), s"$col IS NULL"), 53 | (IsNotNull(attr("foo", "bar")), s"$col IS NOT NULL"), 54 | (In(attr("foo", "bar"), Seq()), s"CASE WHEN $col IS NULL THEN NULL ELSE FALSE END"), 55 | (In(attr("foo", "bar"), Seq(Literal(1, IntegerType), Literal(2, IntegerType))), 56 | s"$col IN (1, 2)"), 57 | (Not(EqualTo(attr("foo", "bar"), Literal(1, IntegerType))), 58 | s"(NOT ($col = 1))"), 59 | (Or(EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 60 | EqualTo(attr("foo", "bar"), Literal(2, IntegerType)) 61 | ), 62 | s"(($col = 1) OR ($col = 2))"), 63 | (And(EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 64 | EqualTo(attr("foo", "bar"), Literal(2, IntegerType)) 65 | ), 66 | s"($col = 1) AND ($col = 2)") 67 | ) 68 | 69 | cases.foreach { 70 | case (expr, expected) => 71 | compileFilter(expr).get should be(expected) 72 | } 73 | } 74 | 75 | "QueryBuilder.selectedFields" should "return SQL for selected tables" in { 76 | QueryBuilder(tables = Seq("repositories")) 77 | .selectedFields should be(s"${qualify("repositories", "id")}") 78 | 79 | QueryBuilder(fields = Seq( 80 | attr("repositories", "id"), 81 | attr("references", "name") 82 | )).selectedFields should be( 83 | s"${qualify("repositories", "id")}, ${qualify("references", "name")}" 84 | ) 85 | } 86 | 87 | "QueryBuilder.whereClause" should "return SQL for where clause" in { 88 | QueryBuilder().whereClause should be("") 89 | 90 | QueryBuilder(filters = Seq( 91 | EqualTo(attr("foo", "bar"), Literal(1, IntegerType)) 92 | )).whereClause should be(s"WHERE ${qualify("foo", "bar")} = 1") 93 | 94 | QueryBuilder(filters = Seq( 95 | EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 96 | EqualTo(attr("foo", "baz"), Literal(2, IntegerType)) 97 | )).whereClause should be(s"WHERE ${qualify("foo", "bar")} = 1 AND ${qualify("foo", "baz")} = 2") 98 | } 99 | 100 | "QueryBuilder.selectedTables" should "return SQL for selected tables" in { 101 | QueryBuilder(tables = Seq("repositories")) 102 | .selectedTables should be(s"${prefixTable("repositories")}") 103 | 104 | QueryBuilder(joins = Seq( 105 | Join("repositories", "references", Seq( 106 | JoinCondition("repositories", "id", "references", "repository_id") 107 | )), 108 | Join("references", "commits", Seq( 109 | JoinCondition("references", "repository_id", "commits", "repository_id"), 110 | JoinCondition("references", "name", "commits", "reference_name") 111 | )) 112 | )).selectedTables should be(s"${prefixTable("repositories")} INNER JOIN " + 113 | s"${prefixTable("references")} ON (" + 114 | s"${qualify("repositories", "id")} = ${qualify("references", "repository_id")}) INNER JOIN " + 115 | s"${prefixTable("commits")} ON (${qualify("references", "repository_id")} = " + 116 | s"${qualify("commits", "repository_id")} AND ${qualify("references", "name")} = " + 117 | s"${qualify("commits", 
"reference_name")})") 118 | } 119 | 120 | "QueryBuilder.sql" should "return SQL for the query" in { 121 | QueryBuilder( 122 | fields = Seq(attr("repositories", "id")), 123 | tables = Seq("repositories"), 124 | filters = Seq(EqualTo(attr("repositories", "id"), Literal("foo", StringType))) 125 | ).sql should be(s"SELECT ${qualify("repositories", "id")} " + 126 | s"FROM ${prefixTable("repositories")} " + 127 | s"WHERE ${qualify("repositories", "id")} = ${compileValue("foo")}") 128 | } 129 | 130 | def attr(table: String, name: String): Attribute = 131 | AttributeReference( 132 | name, 133 | StringType, 134 | nullable = false, 135 | new MetadataBuilder().putString(Sources.SourceKey, table).build() 136 | )() 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/StorageLevelSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | 5 | class StorageLevelSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 6 | 7 | var engine: Engine = _ 8 | 9 | override protected def beforeAll(): Unit = { 10 | super.beforeAll() 11 | engine = Engine(ss, resourcePath, "siva") 12 | } 13 | 14 | "A Dataframe" should "work with all storage levels" in { 15 | import org.apache.spark.storage.StorageLevel._ 16 | val storageLevels = List( 17 | DISK_ONLY, 18 | DISK_ONLY_2, 19 | MEMORY_AND_DISK, 20 | MEMORY_AND_DISK_2, 21 | MEMORY_AND_DISK_SER, 22 | MEMORY_AND_DISK_SER_2, 23 | MEMORY_ONLY, 24 | MEMORY_ONLY_2, 25 | MEMORY_ONLY_SER, 26 | MEMORY_ONLY_SER_2, 27 | NONE, 28 | OFF_HEAP 29 | ) 30 | 31 | storageLevels.foreach(level => { 32 | val df = engine.getRepositories.persist(level) 33 | df.count() 34 | df.unpersist() 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/BaseChainableIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.eclipse.jgit.lib.Repository 6 | import org.scalatest.{Matchers, Suite} 7 | import tech.sourced.engine.provider.{RepositoryProvider, RepositorySource, RepositoryRDDProvider} 8 | import tech.sourced.engine.{BaseSivaSpec, BaseSparkSpec} 9 | 10 | trait BaseChainableIterator extends Suite with BaseSparkSpec with BaseSivaSpec with Matchers { 11 | override def afterAll(): Unit = { 12 | super.afterAll() 13 | provider.close(source, repo) 14 | } 15 | 16 | lazy val prov: RepositoryRDDProvider = RepositoryRDDProvider(ss.sparkContext) 17 | lazy val rdd: RDD[RepositorySource] = prov.get(resourcePath, RepositoryRDDProvider.SivaFormat) 18 | 19 | lazy val source: RepositorySource = rdd.filter(source => source.pds.getPath() 20 | .endsWith("fff7062de8474d10a67d417ccea87ba6f58ca81d.siva")).first() 21 | lazy val provider: RepositoryProvider = RepositoryProvider("/tmp") 22 | lazy val repo: Repository = provider.get(source) 23 | 24 | def testIterator(iterator: (Repository) => Iterator[Row], 25 | matcher: (Int, Row) => Unit, 26 | total: Int, 27 | columnsCount: Int): Unit = { 28 | val ri: Iterator[Row] = iterator(repo) 29 | 30 | var count: Int = 0 31 | while (ri.hasNext) { 32 | val row: Row = ri.next() 33 | row.length should be(columnsCount) 34 | matcher(count, row) 35 | count += 1 36 | } 37 | 38 | count should be(total) 39 | } 40 | 
} 41 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/GitTreeEntryIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.scalatest.FlatSpec 4 | import tech.sourced.engine.util.{Attr, EqualFilter, NotFilter} 5 | 6 | class GitTreeEntryIteratorSpec extends FlatSpec with BaseChainableIterator { 7 | 8 | private val cols = Array( 9 | "commit_hash", 10 | "repository_id", 11 | "reference_name", 12 | "path", 13 | "blob" 14 | ) 15 | 16 | private val allCommitsFilter = NotFilter(EqualFilter(Attr("index", "commits"), -1)) 17 | 18 | "GitTreeEntryIterator" should "return all tree entries from all commits " + 19 | "from all repositories into a siva file" in { 20 | testIterator(repo => 21 | new GitTreeEntryIterator( 22 | cols, 23 | repo, 24 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 25 | Seq(), 26 | false 27 | ), { 28 | case (0, row) => 29 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 30 | row.getString(1) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 31 | row.getString(2) should be("refs/heads/HEAD") 32 | row.getString(3) should be("LICENSE") 33 | row.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 34 | case (1, row) => 35 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 36 | row.getString(1) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 37 | row.getString(2) should be("refs/heads/HEAD") 38 | row.getString(3) should be("README.md") 39 | row.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 40 | case (2, row) => 41 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 42 | row.getString(1) should be("github.com/mawag/faq-xiyoulinux") 43 | row.getString(2) should be("refs/heads/HEAD") 44 | row.getString(3) should be("LICENSE") 45 | row.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 46 | case (3, row) => 47 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 48 | row.getString(1) should be("github.com/mawag/faq-xiyoulinux") 49 | row.getString(2) should be("refs/heads/HEAD") 50 | row.getString(3) should be("README.md") 51 | row.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 52 | case _ => 53 | }, total = 23189, columnsCount = cols.length 54 | ) 55 | } 56 | 57 | it should "filter by path" in { 58 | val filters = Seq(EqualFilter( 59 | Attr("path", "tree_entries"), 60 | "README.md") 61 | ) 62 | 63 | testIterator(repo => 64 | new GitTreeEntryIterator( 65 | cols, 66 | repo, 67 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 68 | filters, 69 | false 70 | ), { 71 | case (_, r) => 72 | r.getString(3) should be("README.md") 73 | }, total = 1062, columnsCount = cols.length 74 | ) 75 | } 76 | 77 | it should "filter by blob" in { 78 | val filters = Seq(EqualFilter( 79 | Attr("blob", "tree_entries"), 80 | "733c072369ca77331f392c40da7404c85c36542c") 81 | ) 82 | 83 | testIterator(repo => 84 | new GitTreeEntryIterator( 85 | cols, 86 | repo, 87 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 88 | filters, 89 | false 90 | ), { 91 | case (_, r) => 92 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 93 | }, total = 1062, columnsCount = cols.length 94 | ) 95 | } 96 | 97 | it should "work when it's chained" in { 98 | val filters = Seq(EqualFilter( 99 | Attr("hash", "commits"), 
100 | "fff7062de8474d10a67d417ccea87ba6f58ca81d"), 101 | allCommitsFilter 102 | ) 103 | 104 | testIterator(repo => 105 | new GitTreeEntryIterator( 106 | cols, 107 | repo, 108 | new CommitIterator(Array("hash"), repo, null, filters, false), 109 | Seq(), 110 | false 111 | ), { 112 | case (i, r) if i % 2 == 0 => 113 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 114 | r.getString(3) should be("LICENSE") 115 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 116 | 117 | case (_, r) => 118 | r.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 119 | r.getString(3) should be("README.md") 120 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 121 | }, total = 86, columnsCount = cols.length 122 | ) 123 | } 124 | 125 | it should "filter by commit hash" in { 126 | val filters = Seq(EqualFilter( 127 | Attr("commit_hash", "tree_entries"), 128 | "fff7062de8474d10a67d417ccea87ba6f58ca81d") 129 | ) 130 | 131 | testIterator(repo => 132 | new GitTreeEntryIterator( 133 | cols, 134 | repo, 135 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 136 | filters, 137 | false 138 | ), { 139 | case (i, r) if i % 2 == 0 => 140 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 141 | r.getString(3) should be("LICENSE") 142 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 143 | 144 | case (_, r) => 145 | r.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 146 | r.getString(3) should be("README.md") 147 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 148 | }, total = 86, columnsCount = cols.length 149 | ) 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/MetadataIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.nio.file.Paths 4 | import java.util.{Properties, UUID} 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} 9 | import org.apache.spark.sql.types.{Metadata, StringType, StructType} 10 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 11 | import tech.sourced.engine.{BaseSparkSpec, Schema} 12 | 13 | class JDBCQueryIteratorSpec 14 | extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec { 15 | private val tmpPath = Paths.get( 16 | System.getProperty("java.io.tmpdir"), 17 | UUID.randomUUID.toString 18 | ) 19 | 20 | private val dbPath = tmpPath.resolve("test.db") 21 | 22 | override def beforeAll(): Unit = { 23 | super.beforeAll() 24 | tmpPath.toFile.mkdir() 25 | val rdd = ss.sparkContext.parallelize(Seq( 26 | Row("id1"), 27 | Row("id2"), 28 | Row("id3") 29 | )) 30 | 31 | val properties = new Properties() 32 | properties.put("driver", "org.sqlite.JDBC") 33 | val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head))) 34 | df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties) 35 | } 36 | 37 | override def afterAll(): Unit = { 38 | super.afterAll() 39 | FileUtils.deleteQuietly(tmpPath.toFile) 40 | } 41 | 42 | "JDBCQueryIterator" should "return all rows for the query" in { 43 | val iter = new JDBCQueryIterator( 44 | Seq(attr("id")), 45 | dbPath.toString, 46 | "SELECT id FROM repositories ORDER BY id" 47 | ) 48 | 49 | // calling hasNext more than one 
time does not cause rows to be lost 50 | iter.hasNext 51 | iter.hasNext 52 | val rows = (for (row <- iter) yield row).toArray 53 | rows.length should be(3) 54 | rows(0).length should be(1) 55 | rows(0)(0).toString should be("id1") 56 | rows(1)(0).toString should be("id2") 57 | rows(2)(0).toString should be("id3") 58 | } 59 | 60 | private def attr(name: String): Attribute = AttributeReference( 61 | name, StringType, nullable = false, Metadata.empty 62 | )() 63 | } 64 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/ReferenceIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.scalatest.FlatSpec 4 | import tech.sourced.engine.util.{Attr, EqualFilter} 5 | 6 | class ReferenceIteratorSpec extends FlatSpec with BaseChainableIterator { 7 | 8 | "ReferenceIterator" should "return all references from all repositories into a siva file" in { 9 | testIterator( 10 | new ReferenceIterator(Array("repository_id", "name", "hash"), _, null, Seq(), false), { 11 | case (0, row) => 12 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 13 | row.getString(1) should be("refs/heads/HEAD") 14 | row.getString(2) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 15 | case (1, row) => 16 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 17 | row.getString(1) should be("refs/heads/HEAD") 18 | row.getString(2) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 19 | case (2, row) => 20 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 21 | row.getString(1) should be("refs/heads/develop") 22 | row.getString(2) should be("880653c14945dbbc915f1145561ed3df3ebaf168") 23 | case _ => 24 | }, total = 43, columnsCount = 3 25 | ) 26 | } 27 | 28 | it should "return only specified columns" in { 29 | testIterator( 30 | new ReferenceIterator(Array("repository_id", "name"), _, null, Seq(), false), { 31 | case (0, row) => 32 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 33 | row.getString(1) should be("refs/heads/HEAD") 34 | case (1, row) => 35 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 36 | row.getString(1) should be("refs/heads/HEAD") 37 | case (2, row) => 38 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 39 | row.getString(1) should be("refs/heads/develop") 40 | case _ => 41 | }, total = 43, columnsCount = 2 42 | ) 43 | } 44 | 45 | it should "apply passed filters" in { 46 | testIterator( 47 | new ReferenceIterator( 48 | Array("repository_id", "name"), 49 | _, 50 | null, 51 | Seq(EqualFilter(Attr("name", "references"), "refs/heads/develop")), 52 | false 53 | ), { 54 | case (0, row) => 55 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 56 | row.getString(1) should be("refs/heads/develop") 57 | case (1, row) => 58 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 59 | row.getString(1) should be("refs/heads/develop") 60 | }, total = 2, columnsCount = 2 61 | ) 62 | } 63 | 64 | it should "use previously passed iterator" in { 65 | testIterator(repo => 66 | new ReferenceIterator( 67 | Array("repository_id", "name"), 68 | repo, 69 | new RepositoryIterator( 70 | "/foo/bar", 71 | Array("id"), 72 | repo, 73 | Seq(EqualFilter(Attr("id", "repository"), "github.com/xiyou-linuxer/faq-xiyoulinux")), 74 | false 75 | ), 76 | Seq(EqualFilter(Attr("name", "references"), "refs/heads/develop")), 77 | 
false 78 | ), { 79 | case (0, row) => 80 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 81 | row.getString(1) should be("refs/heads/develop") 82 | }, total = 1, columnsCount = 2 83 | ) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/RepositoryIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.nio.file.Paths 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.scalatest.{BeforeAndAfterEach, FlatSpec} 8 | import tech.sourced.engine.provider.{RepositoryProvider, RepositoryRDDProvider} 9 | import tech.sourced.engine.util.{Attr, EqualFilter} 10 | 11 | class RepositoryIteratorSpec extends FlatSpec with BaseChainableIterator with BeforeAndAfterEach { 12 | 13 | private var tmpDir: java.nio.file.Path = _ 14 | 15 | override def beforeEach(): Unit = { 16 | super.beforeEach() 17 | tmpDir = Paths.get(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString) 18 | tmpDir.toFile.mkdir() 19 | } 20 | 21 | override def afterEach(): Unit = { 22 | super.afterEach() 23 | FileUtils.deleteQuietly(tmpDir.toFile) 24 | } 25 | 26 | "RepositoryIterator" should "return data for all repositories into a siva file" in { 27 | testIterator( 28 | new RepositoryIterator( 29 | "/foo/bar", 30 | Array("id", "urls", "is_fork", "repository_path"), 31 | _, 32 | Seq(), 33 | false 34 | ), { 35 | case (0, row) => 36 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 37 | row.getAs[Array[String]](1).length should be(3) 38 | row.getBoolean(2) should be(false) 39 | row.getString(3) should be("/foo/bar") 40 | case (1, row) => 41 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 42 | row.getAs[Array[String]](1).length should be(3) 43 | row.getBoolean(2) should be(true) 44 | row.getString(3) should be("/foo/bar") 45 | case (c, _) => fail(s"unexpected row number: $c") 46 | }, total = 2, columnsCount = 4 47 | ) 48 | } 49 | 50 | it should "return only specified columns" in { 51 | testIterator( 52 | new RepositoryIterator("/foo/bar", Array("id", "is_fork"), _, Seq(), false), { 53 | case (0, row) => 54 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 55 | row.getBoolean(1) should be(false) 56 | case (1, row) => 57 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 58 | row.getBoolean(1) should be(true) 59 | case (c, _) => fail(s"unexpected row number: $c") 60 | }, total = 2, columnsCount = 2 61 | ) 62 | } 63 | 64 | it should "apply passed filters" in { 65 | testIterator( 66 | new RepositoryIterator( 67 | "/foo/bar", 68 | Array("id", "is_fork"), 69 | _, 70 | Seq(EqualFilter(Attr("id", "repository"), "github.com/mawag/faq-xiyoulinux")), 71 | false 72 | ), { 73 | case (0, row) => 74 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 75 | row.getBoolean(1) should be(true) 76 | case (c, _) => fail(s"unexpected row number: $c") 77 | }, total = 1, columnsCount = 2 78 | ) 79 | } 80 | 81 | it should "return a repository for each distinct remote and the local dir" in { 82 | import tech.sourced.engine.util.RepoUtils._ 83 | 84 | val gitRepo = createRepo(tmpDir.resolve("repo")) 85 | 86 | addRemote(gitRepo, "repo", "git@github.com:git/repo.git") 87 | 88 | FileUtils.write(tmpDir.resolve("repo").resolve("README.md").toFile, "hello world") 89 | gitRepo.add().addFilepattern("README.md").call() 90 | 
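    // Commit the staged README so the directory is a usable repository; the iterator is then
    // expected to report one row per distinct remote (github.com/git/repo) plus the local
    // file:// directory, matching the assertions further down.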
gitRepo.commit().setMessage("first commit on regular repo").call() 91 | 92 | val rdd = RepositoryRDDProvider(ss.sparkContext) 93 | .get(tmpDir.toString, RepositoryRDDProvider.StandardFormat) 94 | val source = rdd.first() 95 | val provider = RepositoryProvider(tmpDir.toString) 96 | val repo = provider.get(source) 97 | 98 | val iter = new RepositoryIterator("/foo/bar", Array("id"), repo, Seq(), false) 99 | val repos = iter.toList 100 | 101 | repos.length should be(2) 102 | repos.head(0).toString should be("github.com/git/repo") 103 | repos(1)(0).toString should startWith("file://") 104 | 105 | provider.close(source, repo) 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/provider/RepositoryRDDProviderSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} 8 | import tech.sourced.engine.util.RepoUtils 9 | import tech.sourced.engine.{BaseSivaSpec, BaseSparkSpec} 10 | 11 | class RepositoryRDDProviderSpec extends FlatSpec with Matchers with BeforeAndAfterEach 12 | with BaseSparkSpec with BaseSivaSpec { 13 | 14 | private var provider: RepositoryRDDProvider = _ 15 | private var tmpPath: Path = _ 16 | 17 | override def beforeEach(): Unit = { 18 | super.beforeEach() 19 | provider = RepositoryRDDProvider(ss.sparkContext) 20 | tmpPath = Paths.get( 21 | System.getProperty("java.io.tmpdir"), 22 | UUID.randomUUID().toString 23 | ) 24 | } 25 | 26 | override def afterEach(): Unit = { 27 | super.afterEach() 28 | 29 | FileUtils.deleteQuietly(tmpPath.toFile) 30 | } 31 | 32 | "RepositoryRDDProvider" should "retrieve bucketized raw repositories" in { 33 | tmpPath.resolve("a").toFile.mkdir() 34 | createRepo(tmpPath.resolve("a").resolve("repo")) 35 | 36 | tmpPath.resolve("b").toFile.mkdir() 37 | createRepo(tmpPath.resolve("b").resolve("repo")) 38 | 39 | createRepo(tmpPath.resolve("repo")) 40 | 41 | val repos = provider.get(tmpPath.toString, "standard").collect() 42 | repos.length should be(3) 43 | } 44 | 45 | it should "retrieve non-bucketized raw repositories" in { 46 | tmpPath.resolve("a").toFile.mkdir() 47 | createRepo(tmpPath.resolve("repo")) 48 | 49 | tmpPath.resolve("b").toFile.mkdir() 50 | createRepo(tmpPath.resolve("repo2")) 51 | 52 | val repos = provider.get(tmpPath.toString, "standard").collect() 53 | repos.length should be(2) 54 | } 55 | 56 | it should "retrieve bucketized siva repositories" in { 57 | val repos = provider.get(resourcePath, "siva").collect() 58 | repos.length should be(3) 59 | } 60 | 61 | it should "retrieve non-bucketized siva repositories" in { 62 | val repos = provider.get(Paths.get(resourcePath, "ff").toString, "siva").collect() 63 | repos.length should be(1) 64 | } 65 | 66 | private def createRepo(path: Path) = { 67 | val repo = RepoUtils.createRepo(path) 68 | RepoUtils.commitFile(repo, "file.txt", "something something", "some commit") 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/util/FilterSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import org.apache.spark.sql.catalyst.expressions._ 4 | import org.apache.spark.sql.types.StringType 5 | import 
org.scalatest.{FlatSpec, Matchers} 6 | 7 | class FilterSpec extends FlatSpec with Matchers { 8 | "CompiledFilters" should "filter properly depending of his type" in { 9 | val eq = EqualFilter(Attr("test", ""), "a") 10 | 11 | eq.eval("a") should be(true) 12 | eq.eval("b") should be(false) 13 | 14 | val notEq = NotFilter(EqualFilter(Attr("test", ""), "a")) 15 | 16 | notEq.eval("a") should be(false) 17 | notEq.eval("b") should be(true) 18 | 19 | val in = InFilter(Attr("test", ""), Array("a", "b", "c")) 20 | 21 | in.eval("a") should be(true) 22 | in.eval("b") should be(true) 23 | in.eval("c") should be(true) 24 | in.eval("d") should be(false) 25 | 26 | val gt = GreaterThanFilter(Attr("test", ""), 5) 27 | 28 | gt.eval(4) should be(false) 29 | gt.eval(5) should be(false) 30 | gt.eval(6) should be(true) 31 | 32 | val gte = GreaterThanOrEqualFilter(Attr("test", ""), 5) 33 | 34 | gte.eval(4) should be(false) 35 | gte.eval(5) should be(true) 36 | gte.eval(6) should be(true) 37 | 38 | val lt = LessThanFilter(Attr("test", ""), 5) 39 | 40 | lt.eval(4) should be(true) 41 | lt.eval(5) should be(false) 42 | lt.eval(6) should be(false) 43 | 44 | val lte = LessThanOrEqualFilter(Attr("test", ""), 5) 45 | 46 | lte.eval(4) should be(true) 47 | lte.eval(5) should be(true) 48 | lte.eval(6) should be(false) 49 | } 50 | 51 | "ColumnFilter" should "process correctly columns" in { 52 | // test = 'val' AND test IS NOT NULL AND test2 = 'val2' AND test3 IN ('a', 'b') 53 | val f = Filter.compile(And( 54 | And( 55 | And( 56 | EqualTo(AttributeReference("test", StringType)(), Literal("val")), 57 | IsNotNull(AttributeReference("test", StringType)()) 58 | ), 59 | EqualTo(AttributeReference("test2", StringType)(), Literal("val2")) 60 | ), 61 | In(AttributeReference("test3", StringType)(), Seq(Literal("a"), Literal("b"))) 62 | )) 63 | 64 | f.length should be(4) 65 | val filters = Filters(f) 66 | filters.matches(Seq("test"), "val") should be(true) 67 | filters.matches(Seq("test2"), "val") should be(false) 68 | filters.matches(Seq("test3"), "b") should be(true) 69 | } 70 | 71 | "ColumnFilter" should "handle correctly unsupported filters" in { 72 | val f = Filter.compile(StartsWith(AttributeReference("test", StringType)(), Literal("a"))) 73 | 74 | f.length should be(0) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/util/RepoUtils.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.nio.file.{Path, Paths} 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.eclipse.jgit.api.CreateBranchCommand.SetupUpstreamMode 7 | import org.eclipse.jgit.api.Git 8 | import org.eclipse.jgit.revwalk.RevCommit 9 | import org.eclipse.jgit.transport.URIish 10 | 11 | object RepoUtils { 12 | 13 | def createBareRepo(path: Path): Git = { 14 | Git.init().setBare(true).setDirectory(path.toFile).call() 15 | } 16 | 17 | def createRepo(path: Path): Git = { 18 | Git.init().setDirectory(path.toFile).call() 19 | } 20 | 21 | def addRemote(repo: Git, name: String, url: String): Unit = { 22 | val cmd = repo.remoteAdd() 23 | cmd.setName(name) 24 | cmd.setUri(new URIish(url)) 25 | cmd.call() 26 | } 27 | 28 | def commitFile(repo: Git, name: String, content: String, msg: String): RevCommit = { 29 | val file = Paths.get(repo.getRepository.getDirectory.getParent, name) 30 | FileUtils.write(file.toFile, content) 31 | repo.add().addFilepattern(name).call() 32 | 
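    // With the new file staged, create the commit; the resulting RevCommit is returned to callers.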
repo.commit().setMessage(msg).call() 33 | } 34 | 35 | } 36 | --------------------------------------------------------------------------------
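Taken together, the specs above trace the query chain end to end. As a condensed, hypothetical sketch built only from calls that appear in these tests (Engine, registerUDFs, getRepositories, getHEAD, getCommits, getTreeEntries, getBlobs, classifyLanguages), with "/path/to/siva-files" standing in as a placeholder path:

```scala
import org.apache.spark.sql.SparkSession
import tech.sourced.engine._

object ChainExample extends App {
  val spark = SparkSession.builder()
    .appName("jgit-spark-connector-example")
    .master("local[*]")
    .getOrCreate()

  // Register the engine UDFs on the session, as BaseSparkSpec does in beforeAll.
  spark.registerUDFs()

  // "/path/to/siva-files" is a placeholder for a directory of siva-packed repositories.
  val engine = Engine(spark, "/path/to/siva-files", "siva")

  // Blobs reachable from each repository's HEAD, tagged with a detected language,
  // mirroring the "get all files from HEADS that are Ruby" case in BaseSourceSpec.
  val blobs = engine.getRepositories
    .getHEAD
    .getCommits
    .getTreeEntries
    .getBlobs
    .classifyLanguages

  blobs.filter(blobs("lang") === "Ruby")
    .select("repository_id", "path", "lang")
    .show()

  spark.stop()
}
```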