├── .dockerignore ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DCO ├── Dockerfile ├── ISSUE_TEMPLATE.md ├── LICENSE ├── MAINTAINERS ├── Makefile ├── README.md ├── _examples ├── README.md ├── notebooks │ └── Example.ipynb ├── pyspark │ ├── pyspark-shell-basic.md │ ├── pyspark-shell-classifying-languages.md │ ├── pyspark-shell-lang-and-uast.md │ ├── pyspark-shell-raw-repositories.md │ ├── pyspark-shell-schemas.md │ ├── pyspark-shell-uast-extraction.md │ └── pyspark-shell-xpath-query.md ├── scala │ ├── spark-shell-basic.md │ ├── spark-shell-classifying-languages.md │ ├── spark-shell-lang-and-uast.md │ ├── spark-shell-raw-repositories.md │ ├── spark-shell-schemas.md │ ├── spark-shell-uast-extraction.md │ └── spark-shell-xpath-query.md └── siva-files │ ├── 2d58138f24fa863c235b0c33158b870a40c79ee2.siva │ ├── 5d4a8bf30c0da7209f651632b62a362620556c85.siva │ └── aac052c42c501abf6aa8c3509424e837bb27e188.siva ├── build.sbt ├── documentation └── proposals │ ├── ENIP-000.md │ ├── ENIP-001.md │ ├── ENIP-002.md │ ├── ENIP-003.md │ ├── ENIP-004.md │ └── README.md ├── key.asc.enc ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── python ├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── setup.cfg ├── setup.py ├── sourced │ ├── __init__.py │ ├── engine │ │ ├── __init__.py │ │ └── engine.py │ └── examples │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── repo_files.py │ │ ├── repo_references.py │ │ ├── repos.py │ │ └── uasts.py └── test │ ├── __init__.py │ ├── base.py │ ├── test_engine.py │ └── test_sourced_dataframe.py ├── sbt ├── scalastyle-config.xml └── src ├── main └── scala │ ├── org │ └── apache │ │ └── spark │ │ └── UtilsWrapper.scala │ └── tech │ └── sourced │ └── engine │ ├── DefaultSource.scala │ ├── Engine.scala │ ├── MetadataSource.scala │ ├── QueryBuilder.scala │ ├── Schema.scala │ ├── Sources.scala │ ├── TableBuilder.scala │ ├── compat │ └── compat.scala │ ├── exception │ └── RepositoryException.scala │ ├── iterator │ ├── BlobIterator.scala │ ├── ChainableIterator.scala │ ├── CleanupIterator.scala │ ├── CommitIterator.scala │ ├── GitTreeEntryIterator.scala │ ├── MetadataIterator.scala │ ├── ReferenceIterator.scala │ ├── RepositoryIterator.scala │ └── RootedRepo.scala │ ├── package.scala │ ├── provider │ ├── ReadOnlyFileRepository.scala │ ├── RepositoryProvider.scala │ └── RepositoryRDDProvider.scala │ ├── rule │ ├── AddSourceToAttributes.scala │ ├── RelationOptimizer.scala │ ├── SquashGitRelationsJoin.scala │ └── SquashMetadataRelationsJoin.scala │ ├── udf │ ├── ClassifyLanguagesUDF.scala │ ├── ConcatArrayUDF.scala │ ├── CustomUDF.scala │ ├── ExtractTokensUDF.scala │ ├── ExtractUASTsUDF.scala │ └── QueryXPathUDF.scala │ └── util │ ├── Bblfsh.scala │ ├── Filters.scala │ ├── GitUrlsParser.scala │ └── MD5Gen.scala └── test ├── resources ├── bad-siva-files │ └── 0a0bfaa46954437548fbaeb0e19237f84e968511.siva ├── log4j.properties ├── siva-files │ ├── 05893125684f2d3943cd84a7ab2b75e53668fba1.siva │ ├── ff │ │ └── fff840f8784ef162dc83a1465fc5763d890b68ba.siva │ ├── fff7062de8474d10a67d417ccea87ba6f58ca81d.siva │ └── not-siva.txt └── zip-slip-siva-files │ └── git-zipslip.siva └── scala └── tech └── sourced └── engine ├── BaseSivaSpec.scala ├── BaseSourceSpec.scala ├── BaseSparkSpec.scala ├── DefaultSourceSpec.scala ├── EngineSpec.scala ├── FilterUDFSpec.scala ├── MetadataSourceSpec.scala ├── QueryBuilderSpec.scala ├── StorageLevelSpec.scala ├── iterator ├── BaseChainableIterator.scala ├── BlobIteratorSpec.scala ├── CommitIteratorSpec.scala 
├── GitTreeEntryIteratorSpec.scala ├── MetadataIteratorSpec.scala ├── ReferenceIteratorSpec.scala └── RepositoryIteratorSpec.scala ├── provider ├── RepositoryProviderSpec.scala └── RepositoryRDDProviderSpec.scala ├── udf └── CustomUDFSpec.scala └── util ├── FilterSpec.scala └── RepoUtils.scala /.dockerignore: -------------------------------------------------------------------------------- 1 | # Exclude all directories and files except those used by Dockerfile to build the image 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | target/* 4 | project/target/* 5 | project/project/* 6 | .idea 7 | .docsrv-resources 8 | /bin/ 9 | .cache* 10 | .#* 11 | .project 12 | .settings 13 | key.asc 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: required 3 | 4 | language: scala 5 | scala: 2.11.11 6 | 7 | cache: 8 | directories: 9 | - $HOME/.sbt 10 | - $HOME/.ivy2 11 | 12 | services: 13 | - docker 14 | 15 | install: 16 | - | 17 | set -e 18 | if [[ ${LANGUAGE} = python ]]; then 19 | sudo apt install libxml2-dev curl build-essential 20 | make build 21 | cd python 22 | pip install -e . 23 | fi 24 | 25 | before_script: 26 | - make -f "$TRAVIS_BUILD_DIR/Makefile" docker-bblfsh 27 | - make -f "$TRAVIS_BUILD_DIR/Makefile" docker-bblfsh-install-drivers 28 | 29 | script: 30 | - if [[ ${LANGUAGE} = python ]]; then make test ;fi 31 | - | 32 | set -e 33 | if [[ ${LANGUAGE} = java ]]; then 34 | make travis-test 35 | bash <(curl -s https://codecov.io/bash) 36 | fi 37 | 38 | jobs: 39 | include: 40 | - {env: 'LANGUAGE=java SPARK_VERSION=2.2.1', jdk: openjdk8} 41 | - {env: 'LANGUAGE=java SPARK_VERSION=2.3.1', jdk: openjdk8} 42 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.4, language: python} 43 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.5, language: python} 44 | - {env: 'LANGUAGE=python SPARK_VERSION=2.2.1', python: 3.6, language: python} 45 | - {env: 'LANGUAGE=python SPARK_VERSION=2.3.1', python: 3.6, language: python} 46 | 47 | - stage: deploy 48 | if: tag IS present OR (branch = master AND env(TRAVIS_PULL_REQUEST) IS present) 49 | jdk: openjdk8 50 | 51 | install: skip 52 | before_script: skip 53 | 54 | script: 55 | - openssl aes-256-cbc -K $encrypted_8a9ac81f2640_key -iv $encrypted_8a9ac81f2640_iv -in key.asc.enc -out key.asc -d 56 | - gpg --no-default-keyring --primary-keyring ./project/.gnupg/pubring.gpg --secret-keyring ./project/.gnupg/secring.gpg --keyring ./project/.gnupg/pubring.gpg --fingerprint --import key.asc 57 | - make build 58 | - cp target/jgit-spark-connector-uber.jar "jgit-spark-connector-$TRAVIS_TAG.jar" 59 | - make docker-push 60 | 61 | deploy: 62 | - provider: script 63 | script: make maven-release 64 | skip_cleanup: true 65 | on: 66 | tags: true 67 | - provider: releases 68 | api_key: 69 | secure: $GITHUB_TOKEN 70 | file_glob: true 71 | file: "*.jar" 72 | skip_cleanup: true 73 | on: 74 | tags: true 75 | 76 | - if: tag IS present 77 | language: python 78 | python: 3.6 79 | 80 | script: 81 | - sudo apt install libxml2-dev curl build-essential 82 | - make build 83 | - cd python 84 | - pip install -e . 
85 | - echo "$TRAVIS_TAG" | cut -c 2- > version.txt 86 | 87 | deploy: 88 | - provider: pypi 89 | user: $PYPI_USERNAME 90 | password: $PYPI_PASSWORD 91 | skip_cleanup: true 92 | on: 93 | tags: true 94 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at conduct@sourced.tech. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 660 York Street, Suite 102, 6 | San Francisco, CA 94110 USA 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. 37 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jdk as builder 2 | RUN apt-get update && apt-get install -y --no-install-recommends git 3 | RUN mkdir /jgit-spark-connector 4 | WORKDIR /jgit-spark-connector 5 | COPY . 
/jgit-spark-connector 6 | RUN ./sbt assembly 7 | 8 | FROM srcd/jupyter-spark:5.2.1 9 | 10 | RUN mkdir -p /opt/ 11 | 12 | # jgit-spark-connector jar location 13 | ENV SPARK_DRIVER_EXTRA_CLASSPATH spark.driver.extraClassPath 14 | ENV SPARK_EXECUTOR_EXTRA_CLASSPATH spark.executor.extraClassPath 15 | ENV SRCD_JAR /opt/jars/jgit-spark-connector-uber.jar 16 | 17 | # bblfsh endpoint variables 18 | ENV SPARK_BBLFSH_HOST spark.tech.sourced.bblfsh.grpc.host 19 | ENV BBLFSH_HOST bblfshd 20 | ENV SPARK_BBLFSH_PORT spark.tech.sourced.bblfsh.grpc.port 21 | ENV BBLFSH_PORT 9432 22 | 23 | USER root 24 | 25 | RUN apt-get update && \ 26 | apt-get install -y --no-install-suggests --no-install-recommends locales curl g++ libxml2-dev && \ 27 | apt-get clean && \ 28 | locale-gen en_US.UTF-8 29 | 30 | ENV LANG en_US.UTF-8 31 | 32 | COPY ./python /opt/python-jgit-spark-connector/ 33 | COPY ./_examples/notebooks/* /home/$NB_USER/ 34 | COPY --from=builder /jgit-spark-connector/target/jgit-spark-connector-uber.jar /opt/jars/ 35 | 36 | 37 | RUN echo "local" > /opt/python-jgit-spark-connector/version.txt \ 38 | && pip install -e /opt/python-jgit-spark-connector/ \ 39 | && pip install jupyter-spark \ 40 | && jupyter serverextension enable --py jupyter_spark \ 41 | && jupyter nbextension install --py jupyter_spark \ 42 | && jupyter nbextension enable --py jupyter_spark \ 43 | && jupyter nbextension enable --py widgetsnbextension 44 | 45 | # Separate the config file in a different RUN creation as this may change more often 46 | RUN echo "$SPARK_DRIVER_EXTRA_CLASSPATH $SRCD_JAR\n$SPARK_EXECUTOR_EXTRA_CLASSPATH $SRCD_JAR" >> /usr/local/spark/conf/spark-defaults.conf \ 47 | && echo "$SPARK_BBLFSH_HOST $BBLFSH_HOST\n$SPARK_BBLFSH_PORT $BBLFSH_PORT" >> /usr/local/spark/conf/spark-defaults.conf 48 | 49 | # Disable jupyter token 50 | RUN mkdir -p /root/.jupyter && \ 51 | echo "c.NotebookApp.token = ''" > ~/.jupyter/jupyter_notebook_config.py && \ 52 | echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py && \ 53 | echo "c.NotebookApp.notebook_dir = '/home'" >> ~/.jupyter/jupyter_notebook_config.py && \ 54 | echo "c.NotebookApp.port = 8080" >> ~/.jupyter/jupyter_notebook_config.py 55 | 56 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Your issue may already be reported! 2 | Please search on the [issue track](../) before creating one. 3 | 4 | ## Expected Behavior 5 | 6 | 7 | 8 | ## Current Behavior 9 | 10 | 11 | 12 | ## Possible Solution 13 | 14 | 15 | 16 | ## Steps to Reproduce (for bugs) 17 | 18 | 19 | 1. 20 | 2. 21 | 3. 22 | 4. 
23 | 24 | ## Context 25 | 26 | 27 | 28 | ## Your Environment (for bugs) 29 | 30 | * Spark version: 31 | * PySpark version (if using PySpark): 32 | * jgit-spark-connector version: 33 | * Operating System and version: 34 | * Some needed resources to reproduce the problem: 35 | -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Antonio Navarro Perez (@ajnavarro) 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Docsrv: configure the languages whose api-doc can be auto generated 2 | LANGUAGES = "go scala python" 3 | # Docsrv: configure the directory containing the python sources 4 | PYTHON_MAIN_DIR ?= ./python 5 | # Docs: do not edit this 6 | DOCS_REPOSITORY := https://github.com/src-d/docs 7 | SHARED_PATH ?= $(shell pwd)/.docsrv-resources 8 | DOCS_PATH ?= $(SHARED_PATH)/.docs 9 | $(DOCS_PATH)/Makefile.inc: 10 | git clone --quiet --depth 1 $(DOCS_REPOSITORY) $(DOCS_PATH); 11 | -include $(DOCS_PATH)/Makefile.inc 12 | 13 | # Docker 14 | DOCKER_CMD = docker 15 | DOCKER_BUILD = $(DOCKER_CMD) build 16 | DOCKER_TAG ?= $(DOCKER_CMD) tag 17 | DOCKER_PUSH ?= $(DOCKER_CMD) push 18 | DOCKER_RUN = $(DOCKER_CMD) run 19 | DOCKER_RMI = $(DOCKER_CMD) rmi -f 20 | DOCKER_EXEC = $(DOCKER_CMD) exec 21 | 22 | # Docker run bblfsh server container 23 | BBLFSH_CONTAINER_NAME = bblfshd 24 | BBLFSH_HOST_PORT = 9432 25 | BBLFSH_CONTAINER_PORT = 9432 26 | BBLFSH_HOST_VOLUME = /var/lib/bblfshd 27 | BBLFSH_CONTAINER_VOLUME = /var/lib/bblfshd 28 | BBLFSH_IMAGE = bblfsh/bblfshd 29 | BBLFSH_VERSION = v2.5.0 30 | 31 | BBLFSH_RUN_FLAGS := --detach --name $(BBLFSH_CONTAINER_NAME) --privileged \ 32 | -p $(BBLFSH_HOST_PORT):$(BBLFSH_CONTAINER_PORT) \ 33 | -v $(BBLFSH_HOST_VOLUME):$(BBLFSH_CONTAINER_VOLUME) \ 34 | $(BBLFSH_IMAGE):$(BBLFSH_VERSION) 35 | 36 | BBLFSH_EXEC_FLAGS = -it 37 | BBLFSH_CTL = bblfshctl 38 | BBLFSH_CTL_DRIVER := $(BBLFSH_CTL) driver 39 | 40 | BBLFSH_CTL_LIST_DRIVERS := $(BBLFSH_CTL_DRIVER) list 41 | BBLFSH_EXEC_LIST_COMMAND := $(BBLFSH_CONTAINER_NAME) bblfshctl driver list 42 | BBLFSH_LIST_DRIVERS := $(BBLFSH_EXEC_FLAGS) $(BBLFSH_EXEC_LIST_COMMAND) 43 | 44 | 45 | # escape_docker_tag escape colon char to allow use a docker tag as rule 46 | define escape_docker_tag 47 | $(subst :,--,$(1)) 48 | endef 49 | 50 | # unescape_docker_tag an escaped docker tag to be use in a docker command 51 | define unescape_docker_tag 52 | $(subst --,:,$(1)) 53 | endef 54 | 55 | # Docker jupyter image tag 56 | GIT_COMMIT=$(shell git rev-parse HEAD | cut -c1-7) 57 | GIT_DIRTY= 58 | ifneq ($(shell git status --porcelain), ) 59 | GIT_DIRTY := -dirty 60 | endif 61 | DEV_PREFIX := dev 62 | VERSION ?= $(DEV_PREFIX)-$(GIT_COMMIT)$(GIT_DIRTY) 63 | 64 | # Docker jupyter image 65 | JUPYTER_IMAGE ?= srcd/jgit-spark-connector-jupyter 66 | JUPYTER_IMAGE_VERSIONED ?= $(call escape_docker_tag,$(JUPYTER_IMAGE):$(VERSION)) 67 | 68 | # Docker run jupyter container 69 | JUPYTER_CONTAINER_NAME = jgit-spark-connector-jupyter 70 | JUPYTER_HOST_PORT = 8080 71 | JUPYTER_CONTAINER_PORT = 8080 72 | REPOSITORIES_HOST_DIR := $(PWD)/_examples/siva-files 73 | REPOSITORIES_CONTAINER_DIR = /repositories 74 | JUPYTER_RUN_FLAGS := --name $(JUPYTER_CONTAINER_NAME) --rm -it \ 75 | -p $(JUPYTER_HOST_PORT):$(JUPYTER_CONTAINER_PORT) \ 76 | -v $(REPOSITORIES_HOST_DIR):$(REPOSITORIES_CONTAINER_DIR) \ 77 | --link 
$(BBLFSH_CONTAINER_NAME):$(BBLFSH_CONTAINER_NAME) \ 78 | $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 79 | 80 | # Versions 81 | SCALA_VERSION ?= 2.11.11 82 | SPARK_VERSION ?= 2.2.1 83 | 84 | # if TRAVIS_SCALA_VERSION defined SCALA_VERSION is overrided 85 | ifneq ($(TRAVIS_SCALA_VERSION), ) 86 | SCALA_VERSION := $(TRAVIS_SCALA_VERSION) 87 | endif 88 | 89 | # if TRAVIS_TAG defined VERSION is overrided 90 | ifneq ($(TRAVIS_TAG), ) 91 | VERSION := $(TRAVIS_TAG) 92 | endif 93 | 94 | # if we are not in master, and it's not a tag the push is disabled 95 | ifneq ($(TRAVIS_BRANCH), master) 96 | ifeq ($(TRAVIS_TAG), ) 97 | pushdisabled = "push disabled for non-master branches" 98 | endif 99 | endif 100 | 101 | # if this is a pull request, the push is disabled 102 | ifneq ($(TRAVIS_PULL_REQUEST), false) 103 | pushdisabled = "push disabled for pull-requests" 104 | endif 105 | 106 | #SBT 107 | SBT = ./sbt ++$(SCALA_VERSION) -Dspark.version=$(SPARK_VERSION) 108 | 109 | # Rules 110 | all: clean build 111 | 112 | clean: 113 | $(SBT) clean 114 | 115 | test: 116 | $(SBT) test 117 | 118 | build: 119 | $(SBT) assembly 120 | 121 | travis-test: 122 | $(SBT) clean coverage test coverageReport scalastyle test:scalastyle 123 | 124 | docker-bblfsh: 125 | $(DOCKER_RUN) $(BBLFSH_RUN_FLAGS) 126 | 127 | docker-bblfsh-install-drivers: 128 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install go bblfsh/go-driver:v0.4.0 129 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install python bblfsh/python-driver:v2.0.0 130 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install java bblfsh/java-driver:v1.2.6 131 | $(DOCKER_EXEC) $(BBLFSH_CONTAINER_NAME) bblfshctl driver install ruby bblfsh/ruby-driver:v2.0.0 132 | 133 | docker-bblfsh-list-drivers: 134 | $(DOCKER_EXEC) $(BBLFSH_LIST_DRIVERS) 135 | 136 | docker-build: 137 | $(if $(pushdisabled),$(error $(pushdisabled))) 138 | 139 | $(DOCKER_BUILD) -t $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) . 
140 | 141 | docker-run: 142 | $(DOCKER_RUN) $(JUPYTER_RUN_FLAGS) 143 | 144 | docker-clean: 145 | $(DOCKER_RMI) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 146 | 147 | docker-push: docker-build 148 | $(if $(pushdisabled),$(error $(pushdisabled))) 149 | 150 | @if [ "$$DOCKER_USERNAME" != "" ]; then \ 151 | $(DOCKER_CMD) login -u="$$DOCKER_USERNAME" -p="$$DOCKER_PASSWORD"; \ 152 | fi; 153 | 154 | $(DOCKER_PUSH) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) 155 | @if [ "$$TRAVIS_TAG" != "" ]; then \ 156 | $(DOCKER_TAG) $(call unescape_docker_tag,$(JUPYTER_IMAGE_VERSIONED)) \ 157 | $(call unescape_docker_tag,$(JUPYTER_IMAGE)):latest; \ 158 | $(DOCKER_PUSH) $(call unescape_docker_tag,$(JUPYTER_IMAGE):latest); \ 159 | fi; 160 | 161 | maven-release: 162 | $(SBT) clean publishSigned && \ 163 | $(SBT) sonatypeRelease 164 | -------------------------------------------------------------------------------- /_examples/README.md: -------------------------------------------------------------------------------- 1 | # jgit-spark-connector 2 | 3 | Here you can find a list of annotated *jgit-spark-connector* examples: 4 | 5 | ### pyspark 6 | 7 | - [pyspark's shell basic example](pyspark/pyspark-shell-basic.md) 8 | 9 | - [pyspark's shell UAST extraction](pyspark/pyspark-shell-uast-extraction.md) 10 | 11 | - [pyspark's shell classifying languages](pyspark/pyspark-shell-classifying-languages.md) 12 | 13 | - [pyspark's shell data schemas](pyspark/pyspark-shell-schemas.md) 14 | 15 | - [pyspark's shell classifying languages and extracting UASTs](pyspark/pyspark-shell-lang-and-uast.md) 16 | 17 | - [pyspark's shell querying UASTs with XPath](pyspark/pyspark-shell-xpath-query.md) 18 | 19 | - [pyspark's shell raw repositories](pyspark/pyspark-shell-raw-repositories.md) 20 | 21 | ### scala 22 | 23 | - [spark-shell basic example](scala/spark-shell-basic.md) 24 | 25 | - [spark-shell UAST extraction](scala/spark-shell-uast-extraction.md) 26 | 27 | - [spark-shell classifying languages](scala/spark-shell-classifying-languages.md) 28 | 29 | - [spark-shell data schemas](scala/spark-shell-schemas.md) 30 | 31 | - [spark-shell classifying languages and extracting UASTs](scala/spark-shell-lang-and-uast.md) 32 | 33 | - [spark-shell querying UASTs with XPath](scala/spark-shell-xpath-query.md) 34 | 35 | - [spark-shell raw repositories](scala/spark-shell-raw-repositories.md) 36 | 37 | ### jupyter notebooks 38 | 39 | - [Basic example](notebooks/Example.ipynb) 40 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-basic.md: -------------------------------------------------------------------------------- 1 | ## Basic example 2 | 3 | In this example, the pyspark-shell is used to show a simple usage of the source{d} jgit-spark-connector. 4 | 5 | First, you can see how to import the package and instantiate an object that provides all the methods to manipulate the data retrieved from repositories. 6 | 7 | The `engine` object is used to get all the repositories, get the `HEAD` references from the repositories and, finally, get all the blobs from those references. Then a table is shown selecting the columns `blob_id`, `path` and `content`.
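As the output below shows, the `content` column holds the raw bytes of each blob. If you want to inspect it as text, the following is a minimal follow-up sketch (assuming the same `engine` object created in the example below and UTF-8 encoded sources; `decode_utf8` is just an illustrative helper name):

```python
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Illustrative helper: decode blob bytes to text, replacing undecodable bytes.
decode_utf8 = F.udf(lambda b: b.decode('utf-8', 'replace') if b is not None else None, StringType())

blobs = engine.repositories.references.head_ref.commits.tree_entries.blobs
blobs.where(blobs.is_binary == False) \
    .withColumn('text', decode_utf8('content')) \
    .select('path', 'text') \
    .show()
```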
8 | 9 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code 15 | ```python 16 | from sourced.engine import Engine 17 | engine = Engine(spark, '/path/to/siva-files', 'siva') 18 | engine.repositories.references.head_ref.commits.tree_entries.blobs.select('blob_id', 'path', 'content').show() 19 | 20 | ''' Output: 21 | +--------------------+--------------------+--------------------+ 22 | | blob_id | path| content| 23 | +--------------------+--------------------+--------------------+ 24 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[64 65 66 20 66 6...| 25 | |7268016814b8ab7bc...| gcd/gcd.py|[69 6D 70 6F 72 7...| 26 | |25dbfff34dcc8d252...| README.md|[23 20 66 75 6E 6...| 27 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 28 | |63bd495dce1d53092...|factorial/factori...|[69 6D 70 6F 72 7...| 29 | |bf17d9730e43f5697...| .travis.yml|[6C 61 6E 67 75 6...| 30 | |a697a655a7bfd6ba1...| prime/is_prime.py|[64 65 66 20 69 7...| 31 | |76052f368f4c9c8de...|pythagorean_tripl...|[66 72 6F 6D 20 7...| 32 | |3be2253ba2e871d3b...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 33 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 34 | |7268016814b8ab7bc...| gcd/gcd.py|[69 6D 70 6F 72 7...| 35 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[69 6D 70 6F 72 7...| 36 | |4d3617f27e277e4b5...|differentiation/s...|[66 72 6F 6D 20 7...| 37 | |4d3617f27e277e4b5...|differentiation/s...|[66 72 6F 6D 20 7...| 38 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[64 65 66 20 67 6...| 39 | |8ab978a56c5dcb239...|factorial/factori...|[64 65 66 20 66 6...| 40 | |e35a52f431feac4b7...| abs/abs.py|[69 6D 70 6F 72 7...| 41 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[69 6D 70 6F 72 7...| 42 | |51bdeff4494d60bb7...|euclidean/distanc...|[69 6D 70 6F 72 7...| 43 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[64 65 66 20 67 6...| 44 | +--------------------+--------------------+--------------------+ 45 | only showing top 20 rows 46 | ''' 47 | ``` 48 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-classifying-languages.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages example 2 | 3 | This example uses the pyspark-shell to show how to classify blobs by their language with `classify_languages()`. 4 | 5 | Making use of the `engine` object, it retrieves repositories to get all blobs from the `HEAD` references from them. After that, a call to `classify_languages()` function detects the language for each file to show them in the aggregated column `lang` beside the selected columns `blob_id` and `path`. 
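Beyond listing the language of every file, as the example below does, a common follow-up is to aggregate the classification. The snippet here is a small sketch (assuming the same `engine` object built in the example below) that counts blobs per detected language with plain DataFrame operations:

```python
blobs = engine.repositories.references.head_ref.commits.tree_entries.blobs

# Count how many blobs were classified into each language.
blobs.classify_languages() \
    .groupBy('lang') \
    .count() \
    .orderBy('count', ascending=False) \
    .show()
```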
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().select("blob_id", "path", "lang").show() 17 | 18 | ''' Output: 19 | +--------------------+--------------------+--------+ 20 | | blob_id| path| lang| 21 | +--------------------+--------------------+--------+ 22 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python| 23 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 24 | |25dbfff34dcc8d252...| README.md|Markdown| 25 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 26 | |63bd495dce1d53092...|factorial/factori...| Python| 27 | |bf17d9730e43f5697...| .travis.yml| YAML| 28 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python| 29 | |76052f368f4c9c8de...|pythagorean_tripl...| Python| 30 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python| 31 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python| 32 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 33 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python| 34 | |4d3617f27e277e4b5...|differentiation/s...| Python| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python| 36 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 37 | |8ab978a56c5dcb239...|factorial/factori...| Python| 38 | |e35a52f431feac4b7...| abs/abs.py| Python| 39 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 40 | |51bdeff4494d60bb7...|euclidean/distanc...| Python| 41 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 42 | +--------------------+--------------------+--------+ 43 | only showing top 20 rows 44 | ''' 45 | ``` 46 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-lang-and-uast.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages and extracting UASTs example 2 | 3 | The combined usage of both, `classify_languages()` and `extract_uasts()` methods, has the advantage that doesn't rely the language detection task on the [bblfsh server](https://github.com/bblfsh/server) , so you can save some time. 4 | 5 | To do that, you just have to call `extract_uasts()` on a Dataframe where previously, `classify_languages()` was used. 
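As the output below shows, blobs for which no UAST could be extracted (for instance the Markdown and YAML files here) come back with an empty `uast` column. The snippet here is a small sketch (assuming the same `engine` object created in the example below) that keeps only the rows with at least one extracted UAST:

```python
from pyspark.sql import functions as F

parsed = engine.repositories.references.head_ref.commits.tree_entries.blobs \
    .classify_languages() \
    .extract_uasts()

# Keep only blobs with a non-empty UAST array.
parsed.where(F.size('uast') > 0).select('path', 'lang').show()
```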
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().extract_uasts().select("path", "lang", "uast").show() 17 | 18 | ''' Output: 19 | +--------------------+--------+-------------+ 20 | | path| lang| uast| 21 | +--------------------+--------+-------------+ 22 | |fibonacci/fibonac...| Python|[[B@759dfd4e]| 23 | | gcd/gcd.py| Python| [[B@36ea40c]| 24 | | README.md|Markdown| []| 25 | |prime/is_prime_op...| Python|[[B@2da632d5]| 26 | |factorial/factori...| Python| [[B@37e738]| 27 | | .travis.yml| YAML| []| 28 | | prime/is_prime.py| Python|[[B@1ada1dfd]| 29 | |pythagorean_tripl...| Python|[[B@6ce2846e]| 30 | |prime/is_prime_op...| Python|[[B@704e33bd]| 31 | |prime/is_prime_op...| Python|[[B@4fff14ab]| 32 | | gcd/gcd.py| Python| [[B@580cd5c]| 33 | |gcd/gcd_optimal_e...| Python|[[B@7db9e876]| 34 | |differentiation/s...| Python|[[B@7c6befa7]| 35 | |differentiation/s...| Python|[[B@4b06f6cd]| 36 | | gcd/gcd.py| Python|[[B@486f38dc]| 37 | |factorial/factori...| Python|[[B@7a2783ff]| 38 | | abs/abs.py| Python|[[B@59124dcb]| 39 | |prime/is_prime_op...| Python|[[B@25de68ba]| 40 | |euclidean/distanc...| Python|[[B@14c61d05]| 41 | | gcd/gcd.py| Python|[[B@52b84c19]| 42 | +--------------------+--------+-------------+ 43 | only showing top 20 rows 44 | ''' 45 | ``` 46 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-raw-repositories.md: -------------------------------------------------------------------------------- 1 | # Raw repositories usage 2 | 3 | In this example, the pyspark-shell is used to show the usage of source{d} jgit-spark-connector with raw git repositories. 4 | 5 | ## Differences with siva usage 6 | 7 | What are the main differences between using the jgit-spark-connector with siva files and raw git repositories? 8 | 9 | * Raw repositories can have non-remote references, siva files do not. 10 | * Even if you have only one repository, you may have N repositories in the output returned by the jgit-spark-connector. That's because different origins are treated as different repositories. In short, you'll have as many repositories as remotes in your repository plus one repository that corresponds to the local repository, which is identified by `file://$PATH_TO_REPOSITORY`. This one will always contain non-remote references and the rest of the repositories will always contain remote references. 11 | 12 | **Note:** raw repositories refer to `standard` and `bare` repositories. 
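Since the local repository is the one whose `id` starts with `file://`, you can tell it apart from the remote-derived entries directly on the `repositories` DataFrame. The snippet below is a sketch under that assumption, with the `engine` object built as shown in the next section:

```python
repos = engine.repositories

# The local repository is identified by a file:// id; the others come from remotes.
local_repos = repos.filter(repos.id.startswith('file://'))
remote_repos = repos.filter(~repos.id.startswith('file://'))

local_repos.select('id').show(truncate=False)
remote_repos.select('id').show(truncate=False)
```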
13 | 14 | ## Getting repository references 15 | 16 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 17 | ```sh 18 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 19 | ``` 20 | 21 | We can get the repositories like this, and see that even though there is only one repository on disk, the jgit-spark-connector reports two: 22 | 23 | ```python 24 | from sourced.engine import Engine 25 | engine = Engine(spark, '/path/to/repositories', 'standard') 26 | print(engine.repositories.count()) 27 | 28 | '''Output: 29 | 2 30 | ''' 31 | ``` 32 | 33 | Getting references: 34 | 35 | ```python 36 | print(engine.repositories.references.count()) 37 | 38 | '''Output: 39 | 4 40 | ''' 41 | ``` 42 | 43 | If you want behavior closer to the siva files usage, you can filter out non-remote references: 44 | 45 | ```python 46 | references = engine.repositories.references 47 | print(references.filter(references.is_remote == True).count()) 48 | 49 | '''Output: 50 | 2 51 | ''' 52 | ``` 53 | 54 | Alternatively, you can use the following shorthand: 55 | 56 | ```python 57 | print(engine.repositories.remote_references.count()) 58 | 59 | '''Output: 60 | 2 61 | ''' 62 | ``` 63 | 64 | ### Caveats 65 | 66 | Note that even if your repository has a reference named `refs/remotes/origin/master`, it will be converted to a reference named `refs/heads/master` that belongs to the repository identified by your origin remote URL. 67 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-schemas.md: -------------------------------------------------------------------------------- 1 | ## Printing schema example 2 | 3 | The next example shows a simple usage of the useful `printSchema()` method. 4 | 5 | It helps you keep track of the columns that your transformations add to or prune from the data you are handling.
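If you prefer to compare schemas programmatically instead of reading the printed trees below, a short sketch (again assuming the `engine` object created in the example below) can diff the column sets of two steps:

```python
blobs = engine.repositories.references.commits.tree_entries.blobs

# Which columns does classify_languages() add on top of the blobs DataFrame?
before = set(blobs.columns)
after = set(blobs.classify_languages().columns)
print(after - before)  # expected to show only the added 'lang' column
```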
6 | 7 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```python 14 | from sourced.engine import Engine 15 | engine = Engine(spark, '/path/to/siva-files', 'siva') 16 | 17 | engine.repositories.printSchema() 18 | ''' Output: 19 | root 20 | |-- id: string (nullable = false) 21 | |-- urls: array (nullable = false) 22 | | |-- element: string (containsNull = false) 23 | |-- is_fork: boolean (nullable = true) 24 | ''' 25 | 26 | engine.repositories.references.printSchema() 27 | ''' Output: 28 | root 29 | |-- repository_id: string (nullable = false) 30 | |-- name: string (nullable = false) 31 | |-- hash: string (nullable = false) 32 | |-- is_remote: boolean (nullable = false) 33 | ''' 34 | 35 | engine.repositories.references.commits.printSchema() 36 | ''' 37 | Also: engine.repositories.references.all_reference_commits.printSchema() 38 | ''' 39 | ''' Output: 40 | root 41 | |-- repository_id: string (nullable = false) 42 | |-- reference_name: string (nullable = false) 43 | |-- index: integer (nullable = false) 44 | |-- hash: string (nullable = false) 45 | |-- message: string (nullable = false) 46 | |-- parents: array (nullable = true) 47 | | |-- element: string (containsNull = false) 48 | |-- parents_count: integer (nullable = false) 49 | |-- author_email: string (nullable = true) 50 | |-- author_name: string (nullable = true) 51 | |-- author_date: timestamp (nullable = true) 52 | |-- committer_email: string (nullable = true) 53 | |-- committer_name: string (nullable = true) 54 | |-- committer_date: timestamp (nullable = true) 55 | ''' 56 | 57 | engine.repositories.references.commits.tree_entries.printSchema() 58 | ''' Output: 59 | root 60 | |-- commit_hash: string (nullable = false) 61 | |-- repository_id: string (nullable = false) 62 | |-- reference_name: string (nullable = false) 63 | |-- path: string (nullable = true) 64 | |-- blob: string (nullable = false) 65 | ''' 66 | 67 | engine.repositories.references.commits.tree_entries.blobs.printSchema() 68 | ''' Output: 69 | root 70 | |-- blob_id: string (nullable = false) 71 | |-- commit_hash: string (nullable = false) 72 | |-- repository_id: string (nullable = false) 73 | |-- reference_name: string (nullable = false) 74 | |-- content: binary (nullable = true) 75 | |-- is_binary: boolean (nullable = false) 76 | |-- path: string (nullable = true) 77 | ''' 78 | 79 | engine.repositories.references.commits.tree_entries.blobs.classify_languages().printSchema() 80 | ''' Output: 81 | root 82 | |-- blob_id: string (nullable = false) 83 | |-- commit_hash: string (nullable = false) 84 | |-- repository_id: string (nullable = false) 85 | |-- reference_name: string (nullable = false) 86 | |-- content: binary (nullable = true) 87 | |-- is_binary: boolean (nullable = false) 88 | |-- path: string (nullable = true) 89 | |-- lang: string (nullable = true) 90 | ''' 91 | 92 | engine.repositories.references.commits.tree_entries.blobs.classify_languages().extract_uasts().printSchema() 93 | ''' Output: 94 | root 95 | |-- blob_id: string (nullable = false) 96 | |-- commit_hash: string (nullable = false) 97 | |-- repository_id: string (nullable = false) 98 | |-- reference_name: string (nullable = false) 99 | |-- content: binary (nullable = true) 100 | |-- is_binary: boolean (nullable = false) 101 | |-- path: string (nullable = true) 102 | |-- lang: string (nullable 
= true) 103 | |-- uast: array (nullable = true) 104 | | |-- element: binary (containsNull = true) 105 | ''' 106 | ``` 107 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-uast-extraction.md: -------------------------------------------------------------------------------- 1 | ## Extracting UASTs example 2 | 3 | In the example code below, you can take a look to how the `extract_uasts()` method works. 4 | 5 | From the `engine` object instantiated in the spark-shell, a bunch of blobs are retrieving from the `HEAD` references from all the repositories and requesting for them. Once we have those blobs, we can call `extract_uasts()` which send the blobs to a [bblfsh server](https://github.com/bblfsh/server) to get back the UASTs. 6 | 7 | Finally, the `blob_id` , `path` and `uast` is showed on the table. 8 | 9 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code: 15 | ```python 16 | from sourced.engine import Engine 17 | engine = Engine(spark, '/path/to/siva-files', 'siva') 18 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().extract_uasts().select("blob_id", "path", "uast").show() 19 | 20 | ''' Output: 21 | +--------------------+--------------------+-------------+ 22 | | blob_id| path| uast| 23 | +--------------------+--------------------+-------------+ 24 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[[B@43efe672]| 25 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@66938491]| 26 | |25dbfff34dcc8d252...| README.md| []| 27 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@51261a61]| 28 | |63bd495dce1d53092...|factorial/factori...|[[B@3163c734]| 29 | |bf17d9730e43f5697...| .travis.yml| []| 30 | |a697a655a7bfd6ba1...| prime/is_prime.py| [[B@d036b1c]| 31 | |76052f368f4c9c8de...|pythagorean_tripl...|[[B@774ec121]| 32 | |3be2253ba2e871d3b...|prime/is_prime_op...|[[B@16da28bb]| 33 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[[B@39af1733]| 34 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@2f62c091]| 35 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[[B@2e245b95]| 36 | |4d3617f27e277e4b5...|differentiation/s...|[[B@697c211a]| 37 | |4d3617f27e277e4b5...|differentiation/s...|[[B@282bb589]| 38 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@11f49e55]| 39 | |8ab978a56c5dcb239...|factorial/factori...|[[B@1d80870d]| 40 | |e35a52f431feac4b7...| abs/abs.py|[[B@157c0156]| 41 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@608e698d]| 42 | |51bdeff4494d60bb7...|euclidean/distanc...|[[B@55bd45ff]| 43 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@4c1c08aa]| 44 | +--------------------+--------------------+-------------+ 45 | only showing top 20 rows 46 | ''' 47 | ``` 48 | -------------------------------------------------------------------------------- /_examples/pyspark/pyspark-shell-xpath-query.md: -------------------------------------------------------------------------------- 1 | ## Querying UASTs with XPath example 2 | 3 | You can see in this example how to make queries using [XPath syntax](https://www.w3.org/TR/xpath/) to retrieve valuable information from the UASTs. 4 | 5 | First we must use `extract_uasts()` method to request to a [bblfsh daemon](https://github.com/bblfsh/bblfshd) the UASTs. 6 | 7 | Then we can use the method `query_uast()` to get a result for the query we are formulating requesting tokens. 
This method takes in three parameters, the query, the column which contains the UASTs and the column that will be generated with the result. 8 | 9 | Finally, `extract_tokens()` method will generate a column `tokens` based on the previous generated column `result`. 10 | 11 | Launch pyspark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 12 | ```sh 13 | $ pyspark --packages "tech.sourced:jgit-spark-connector:[version]" 14 | ``` 15 | 16 | Code: 17 | ```python 18 | from sourced.engine import Engine 19 | engine = Engine(spark, '/path/to/siva-files', 'siva') 20 | 21 | engine.repositories.references.head_ref.commits.tree_entries.blobs.classify_languages().where('lang = "Python"').extract_uasts().query_uast('//*[@roleIdentifier]').extract_tokens('result', 'tokens').select('blob_id', 'path', 'lang', 'uast', 'tokens').show() 22 | 23 | ''' Output: 24 | +--------------------+--------------------+------+-------------+--------------------+ 25 | | blob_id| path| lang| uast| tokens| 26 | +--------------------+--------------------+------+-------------+--------------------+ 27 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|Python|[[B@617b4738]|[fibonacci, n, in...| 28 | |7268016814b8ab7bc...| gcd/gcd.py|Python|[[B@2c66d0f9]|[math, gcd, a, in...| 29 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|Python|[[B@59c072af]|[math, is_prime, ...| 30 | |63bd495dce1d53092...|factorial/factori...|Python|[[B@45b32617]|[math, factorial,...| 31 | |a697a655a7bfd6ba1...| prime/is_prime.py|Python|[[B@7ecafb1e]|[is_prime, n, int...| 32 | |76052f368f4c9c8de...|pythagorean_tripl...|Python|[[B@64311d26]|[typing, List, ty...| 33 | |3be2253ba2e871d3b...|prime/is_prime_op...|Python|[[B@3e3e5e05]|[math, random, RA...| 34 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|Python|[[B@62e1544b]|[math, is_prime_o...| 35 | |7268016814b8ab7bc...| gcd/gcd.py|Python|[[B@4b5a5102]|[math, gcd, a, in...| 36 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|Python|[[B@27eead62]|[math, gcd_optima...| 37 | |4d3617f27e277e4b5...|differentiation/s...|Python|[[B@6b6c11ec]|[typing, Callable...| 38 | |4d3617f27e277e4b5...|differentiation/s...|Python| [[B@3c753c6]|[typing, Callable...| 39 | |6d7c6cb29abb52fc2...| gcd/gcd.py|Python|[[B@1a8cd0fd]|[gcd, a, int, b, ...| 40 | |8ab978a56c5dcb239...|factorial/factori...|Python|[[B@485beb73]|[factorial, n, in...| 41 | |e35a52f431feac4b7...| abs/abs.py|Python|[[B@43b370e5]|[math, abs, x, re...| 42 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|Python|[[B@7a534236]|[math, is_prime, ...| 43 | |51bdeff4494d60bb7...|euclidean/distanc...|Python| [[B@6246eb9]|[math, typing, Tu...| 44 | |6d7c6cb29abb52fc2...| gcd/gcd.py|Python|[[B@11b30d7d]|[gcd, a, int, b, ...| 45 | |e35a52f431feac4b7...| abs/abs.py|Python|[[B@495f63f6]|[math, abs, x, re...| 46 | |8ab978a56c5dcb239...|factorial/factori...|Python|[[B@297dca19]|[factorial, n, in...| 47 | +--------------------+--------------------+------+-------------+--------------------+ 48 | only showing top 20 rows 49 | ''' 50 | ``` 51 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-basic.md: -------------------------------------------------------------------------------- 1 | ## Basic example 2 | 3 | In this example, the spark-shell is used to show a simple usage of the source{d} jgit-spark-connector. 
4 | 5 | First, you can see how to import the package and instantiate an object that provides all the methods to manipulate the data retrieved from repositories. 6 | 7 | The `engine` object is used to filter repositories by `id`, get the `HEAD` references from the repositories and look for the commits in those references that contain the word `Initial` in their messages. Then a table is shown selecting the columns `repository_id`, `hash` and `message`. 8 | 9 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | Code: 15 | ```scala 16 | import tech.sourced.engine._ 17 | 18 | val engine = Engine(spark, "/path/to/siva-files", "siva") 19 | engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getReferences.filter('name === "refs/heads/HEAD").getCommits.filter('message.contains("Initial")).select('repository_id, 'hash, 'message).show 20 | 21 | /* Output: 22 | +--------------------+--------------------+--------------+ 23 | |       repository_id|                hash|       message| 24 | +--------------------+--------------------+--------------+ 25 | |github.com/mingra...|aac052c42c501abf6...|Initial commit| 26 | +--------------------+--------------------+--------------+ 27 | */ 28 | ``` 29 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-classifying-languages.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages example 2 | 3 | This example uses the spark-shell to show how to classify blobs by their language with `classifyLanguages`. 4 | 5 | Making use of the `engine` object, it filters repositories by `id` to get all blobs from their `HEAD` references. After that, a call to the `classifyLanguages` function detects the language of each file, shown in the added column `lang` beside the selected columns `blob_id` and `path`.
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.select('blob_id, 'path, 'lang).show 18 | 19 | /* Output: 20 | +--------------------+--------------------+--------+ 21 | | blob_id| path| lang| 22 | +--------------------+--------------------+--------+ 23 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python| 24 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 25 | |25dbfff34dcc8d252...| README.md|Markdown| 26 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 27 | |63bd495dce1d53092...|factorial/factori...| Python| 28 | |bf17d9730e43f5697...| .travis.yml| YAML| 29 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python| 30 | |76052f368f4c9c8de...|pythagorean_tripl...| Python| 31 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python| 32 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python| 33 | |7268016814b8ab7bc...| gcd/gcd.py| Python| 34 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python| 36 | |4d3617f27e277e4b5...|differentiation/s...| Python| 37 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 38 | |8ab978a56c5dcb239...|factorial/factori...| Python| 39 | |e35a52f431feac4b7...| abs/abs.py| Python| 40 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python| 41 | |51bdeff4494d60bb7...|euclidean/distanc...| Python| 42 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python| 43 | +--------------------+--------------------+--------+ 44 | only showing top 20 rows 45 | */ 46 | ``` 47 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-lang-and-uast.md: -------------------------------------------------------------------------------- 1 | ## Classifying languages and extracting UASTs example 2 | 3 | The combined usage of both, `classifyLanguages` and `extractUASTs` methods, has the advantage that doesn't rely the language detection task on the [bblfsh server](https://github.com/bblfsh/server) , so you can save some time. 4 | 5 | To do that, you just have to call `extractUASTs` on a Dataframe where previously, `classifyLanguages` was used. 
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.extractUASTs.select('blob_id, 'path, 'lang, 'uast).show 18 | 19 | /* Output: 20 | +--------------------+--------------------+--------+-------------+ 21 | | blob_id| path| lang| uast| 22 | +--------------------+--------------------+--------+-------------+ 23 | |ff4fa0794274a7ffb...|fibonacci/fibonac...| Python|[[B@62f37a44]| 24 | |7268016814b8ab7bc...| gcd/gcd.py| Python|[[B@7c0368da]| 25 | |25dbfff34dcc8d252...| README.md|Markdown| []| 26 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python|[[B@7fa8bfe4]| 27 | |63bd495dce1d53092...|factorial/factori...| Python|[[B@3cad2dd4]| 28 | |bf17d9730e43f5697...| .travis.yml| YAML| []| 29 | |a697a655a7bfd6ba1...| prime/is_prime.py| Python|[[B@45f5415f]| 30 | |76052f368f4c9c8de...|pythagorean_tripl...| Python|[[B@22d7a483]| 31 | |3be2253ba2e871d3b...|prime/is_prime_op...| Python|[[B@18ba78a2]| 32 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...| Python|[[B@4dac25ec]| 33 | |7268016814b8ab7bc...| gcd/gcd.py| Python|[[B@223c6abf]| 34 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...| Python|[[B@3dd021c7]| 35 | |4d3617f27e277e4b5...|differentiation/s...| Python|[[B@76e431b7]| 36 | |4d3617f27e277e4b5...|differentiation/s...| Python|[[B@5a4bf9c2]| 37 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python|[[B@1be309a6]| 38 | |8ab978a56c5dcb239...|factorial/factori...| Python|[[B@2781dd04]| 39 | |e35a52f431feac4b7...| abs/abs.py| Python|[[B@70bf39ca]| 40 | |b2675a52ed6bfdfa9...|prime/is_prime_op...| Python|[[B@753f5bf6]| 41 | |51bdeff4494d60bb7...|euclidean/distanc...| Python|[[B@7612c2ce]| 42 | |6d7c6cb29abb52fc2...| gcd/gcd.py| Python|[[B@5f5248f5]| 43 | +--------------------+--------------------+--------+-------------+ 44 | only showing top 20 rows 45 | */ 46 | ``` 47 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-raw-repositories.md: -------------------------------------------------------------------------------- 1 | # Raw repositories usage 2 | 3 | In this example, the spark-shell is used to show the usage of source{d} jgit-spark-connector with raw git repositories. 4 | 5 | ## Differences with siva usage 6 | 7 | What are the main differences between using the jgit-spark-connector with siva files and raw git repositories? 8 | 9 | * Raw repositories can have non-remote references, siva files do not. 10 | * Even if you have only one repository, you may have N repositories in the output returned by the jgit-spark-connector. That's because different origins are treated as different repositories. In short, you'll have as many repositories as remotes in your repository plus one repository that corresponds to the local repository, which is identified by `file://$PATH_TO_REPOSITORY`. This one will always contain non-remote references and the rest of the repositories will always contain remote references. 11 | 12 | **Note:** raw repositories refer to `standard` and `bare` repositories. 
13 | 14 | ## Getting repository references 15 | 16 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 17 | ```sh 18 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 19 | ``` 20 | 21 | We can get the repositories like this, and see that even though there is only one repository on disk, the jgit-spark-connector reports two: 22 | 23 | ```scala 24 | import tech.sourced.engine._ 25 | val engine = Engine(spark, "/path/to/repositories", "standard") 26 | println(engine.getRepositories.count()) 27 | 28 | // Output: 29 | // 2 30 | ``` 31 | 32 | Getting references: 33 | 34 | ```scala 35 | println(engine.getRepositories.getReferences.count()) 36 | 37 | // Output: 38 | // 4 39 | ``` 40 | 41 | If you want behavior closer to the siva files usage, you can filter out non-remote references: 42 | 43 | ```scala 44 | val references = engine.getRepositories.getReferences 45 | println(references.filter(references("is_remote") === true).count()) 46 | 47 | // Output: 48 | // 2 49 | ``` 50 | 51 | Alternatively, you can use the following shorthand: 52 | 53 | ```scala 54 | println(engine.getRepositories.getRemoteReferences.count()) 55 | 56 | // Output: 57 | // 2 58 | ``` 59 | 60 | ### Caveats 61 | 62 | Note that even if your repository has a reference named `refs/remotes/origin/master`, it will be converted to a reference named `refs/heads/master` that belongs to the repository identified by your origin remote URL. 63 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-schemas.md: -------------------------------------------------------------------------------- 1 | ## Printing schema example 2 | 3 | The next example shows a simple usage of the useful `printSchema` method. 4 | 5 | It helps you keep track of the columns that your transformations add to or prune from the data you are handling.
6 | 7 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 8 | ```sh 9 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 10 | ``` 11 | 12 | Code: 13 | ```scala 14 | import tech.sourced.engine._ 15 | 16 | val engine = Engine(spark, "/path/to/siva-files", "siva") 17 | engine.getRepositories.printSchema 18 | /* Output: 19 | root 20 | |-- id: string (nullable = false) 21 | |-- urls: array (nullable = false) 22 | | |-- element: string (containsNull = false) 23 | |-- is_fork: boolean (nullable = true) 24 | */ 25 | 26 | engine.getRepositories.getReferences.printSchema 27 | /* Output: 28 | root 29 | |-- repository_id: string (nullable = false) 30 | |-- name: string (nullable = false) 31 | |-- hash: string (nullable = false) 32 | |-- is_remote: boolean (nullable = false) 33 | */ 34 | 35 | engine.getRepositories.getReferences.getCommits.printSchema 36 | /* also engine.getRepositories.getReferences.getAllReferenceCommits.printSchema */ 37 | /* Output: 38 | root 39 | |-- repository_id: string (nullable = false) 40 | |-- reference_name: string (nullable = false) 41 | |-- index: integer (nullable = false) 42 | |-- hash: string (nullable = false) 43 | |-- message: string (nullable = false) 44 | |-- parents: array (nullable = true) 45 | | |-- element: string (containsNull = false) 46 | |-- parents_count: integer (nullable = false) 47 | |-- author_email: string (nullable = true) 48 | |-- author_name: string (nullable = true) 49 | |-- author_date: timestamp (nullable = true) 50 | |-- committer_email: string (nullable = true) 51 | |-- committer_name: string (nullable = true) 52 | |-- committer_date: timestamp (nullable = true) 53 | */ 54 | 55 | engine.getRepositories.getReferences.getCommits.getTreeEntries.printSchema 56 | /* Output: 57 | root 58 | |-- commit_hash: string (nullable = false) 59 | |-- repository_id: string (nullable = false) 60 | |-- reference_name: string (nullable = false) 61 | |-- blob: string (nullable = true) 62 | */ 63 | 64 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.printSchema 65 | /* Output: 66 | root 67 | |-- blob_id: string (nullable = false) 68 | |-- commit_hash: string (nullable = false) 69 | |-- repository_id: string (nullable = false) 70 | |-- reference_name: string (nullable = false) 71 | |-- content: binary (nullable = true) 72 | |-- is_binary: boolean (nullable = false) 73 | |-- path: string (nullable = true) 74 | */ 75 | 76 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.classifyLanguages.printSchema 77 | /* Output: 78 | root 79 | |-- blob_id: string (nullable = false) 80 | |-- commit_hash: string (nullable = false) 81 | |-- repository_id: string (nullable = false) 82 | |-- reference_name: string (nullable = false) 83 | |-- content: binary (nullable = true) 84 | |-- is_binary: boolean (nullable = false) 85 | |-- path: string (nullable = true) 86 | |-- lang: string (nullable = true) 87 | */ 88 | 89 | engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.classifyLanguages.extractUASTs.printSchema 90 | /* Output: 91 | root 92 | |-- blob_id: string (nullable = false) 93 | |-- commit_hash: string (nullable = false) 94 | |-- repository_id: string (nullable = false) 95 | |-- reference_name: string (nullable = false) 96 | |-- content: binary (nullable = true) 97 | |-- is_binary: boolean (nullable = false) 98 | |-- path: string (nullable = true) 99 | |-- lang: string (nullable = true) 100 | |-- 
uast: array (nullable = true) 101 | | |-- element: binary (containsNull = true) 102 | */ 103 | ``` 104 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-uast-extraction.md: -------------------------------------------------------------------------------- 1 | ## Extracting UASTs example 2 | 3 | The example code below shows how the `extractUASTs` method works. 4 | 5 | From the `engine` object instantiated in the spark-shell, a set of blobs is obtained by filtering repositories by `id`, retrieving their `HEAD` references and requesting the blobs for them. Once we have those blobs, we can call `extractUASTs`, which sends the blobs to a [bblfsh server](https://github.com/bblfsh/server) and gets back the UASTs. 6 | 7 | Finally, the `blob_id`, file `path` and `uast` columns are shown in the table. 8 | 9 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 10 | ```sh 11 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 12 | ``` 13 | 14 | ```scala 15 | import tech.sourced.engine._ 16 | 17 | val engine = Engine(spark, "/path/to/siva-files", "siva") 18 | val exampleDf = engine.getRepositories.filter('id === "github.com/mingrammer/funmath.git").getHEAD.getCommits.getTreeEntries.getBlobs.extractUASTs.select('blob_id, 'path, 'uast) 19 | 20 | exampleDf.show 21 | 22 | /* Output: 23 | +--------------------+--------------------+-------------+ 24 | | blob_id| path| uast| 25 | +--------------------+--------------------+-------------+ 26 | |ff4fa0794274a7ffb...|fibonacci/fibonac...|[[B@5e53daf6]| 27 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@65f08242]| 28 | |25dbfff34dcc8d252...| README.md| []| 29 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@7d81ce6a]| 30 | |63bd495dce1d53092...|factorial/factori...|[[B@4c903df9]| 31 | |bf17d9730e43f5697...| .travis.yml| []| 32 | |a697a655a7bfd6ba1...| prime/is_prime.py| [[B@cd4caf7]| 33 | |76052f368f4c9c8de...|pythagorean_tripl...|[[B@6d57bbbd]| 34 | |3be2253ba2e871d3b...|prime/is_prime_op...|[[B@1ed6dae3]| 35 | |1ec7f95f8be7bf4f3...|prime/is_prime_op...|[[B@53e45335]| 36 | |7268016814b8ab7bc...| gcd/gcd.py|[[B@79cda8cc]| 37 | |793b6e21f2eebe900...|gcd/gcd_optimal_e...|[[B@29976e1b]| 38 | |4d3617f27e277e4b5...|differentiation/s...| [[B@13ea808]| 39 | |4d3617f27e277e4b5...|differentiation/s...|[[B@70323ee1]| 40 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@642d63e3]| 41 | |8ab978a56c5dcb239...|factorial/factori...|[[B@76583ecb]| 42 | |e35a52f431feac4b7...| abs/abs.py| [[B@252b6e0]| 43 | |b2675a52ed6bfdfa9...|prime/is_prime_op...|[[B@63f6557d]| 44 | |51bdeff4494d60bb7...|euclidean/distanc...|[[B@6ccb009b]| 45 | |6d7c6cb29abb52fc2...| gcd/gcd.py|[[B@5b52d5af]| 46 | +--------------------+--------------------+-------------+ 47 | only showing top 20 rows 48 | */ 49 | ``` 50 | -------------------------------------------------------------------------------- /_examples/scala/spark-shell-xpath-query.md: -------------------------------------------------------------------------------- 1 | ## Querying UASTs with XPath example 2 | 3 | This example shows how to run queries using [XPath syntax](https://www.w3.org/TR/xpath/) to retrieve valuable information from the UASTs. 4 | 5 | First, we use the `extractUASTs` method to request the UASTs from a [bblfsh daemon](https://github.com/bblfsh/bblfshd). 6 | 7 | Then we can use the `queryUAST` method to run a query that requests tokens.
This method takes in three parameters, the query, the column which contains the UASTs and the column that will be generated with the result. 8 | 9 | Finally, `extractTokens` method will generate a column `tokens` based on the previous generated column `result`. 10 | 11 | Launch spark-shell, replacing `[version]` with the [latest jgit-spark-connector version](http://search.maven.org/#search%7Cga%7C1%7Ctech.sourced): 12 | ```sh 13 | $ spark-shell --packages "tech.sourced:jgit-spark-connector:[version]" 14 | ``` 15 | 16 | Code: 17 | ```scala 18 | import tech.sourced.engine._ 19 | 20 | val engine = Engine(spark, "/path/to/siva-files", "siva") 21 | engine.getRepositories.getHEAD.getCommits.getTreeEntries.getBlobs.classifyLanguages.where('lang === "Python").extractUASTs.queryUAST("//*[@roleIdentifier]", "uast", "result").extractTokens("result", "tokens").select('path, 'lang, 'uast, 'tokens).show 22 | 23 | /* Output: 24 | +--------------------+------+-------------+--------------------+ 25 | | path| lang| uast| tokens| 26 | +--------------------+------+-------------+--------------------+ 27 | |fibonacci/fibonac...|Python|[[B@466c4700]|[fibonacci, n, in...| 28 | | gcd/gcd.py|Python|[[B@22a4508c]|[math, gcd, a, in...| 29 | |prime/is_prime_op...|Python|[[B@6772d8f3]|[math, is_prime, ...| 30 | |factorial/factori...|Python| [[B@86bff75]|[math, factorial,...| 31 | | prime/is_prime.py|Python|[[B@2c1bed3f]|[is_prime, n, int...| 32 | |pythagorean_tripl...|Python|[[B@2cbbf800]|[typing, List, ty...| 33 | |prime/is_prime_op...|Python|[[B@5d7f1824]|[math, random, RA...| 34 | |prime/is_prime_op...|Python| [[B@ab8c4a9]|[math, is_prime_o...| 35 | | gcd/gcd.py|Python|[[B@7939b2d4]|[math, gcd, a, in...| 36 | |gcd/gcd_optimal_e...|Python| [[B@a313e0b]|[math, gcd_optima...| 37 | |differentiation/s...|Python|[[B@2faab951]|[typing, Callable...| 38 | |differentiation/s...|Python|[[B@637bad81]|[typing, Callable...| 39 | | gcd/gcd.py|Python|[[B@57601c28]|[gcd, a, int, b, ...| 40 | |factorial/factori...|Python|[[B@5422a1a9]|[factorial, n, in...| 41 | | abs/abs.py|Python|[[B@2e38fa4d]|[math, abs, x, re...| 42 | |prime/is_prime_op...|Python|[[B@10914dae]|[math, is_prime, ...| 43 | |euclidean/distanc...|Python|[[B@47c782c8]|[math, typing, Tu...| 44 | | gcd/gcd.py|Python| [[B@6a94c70]|[gcd, a, int, b, ...| 45 | | abs/abs.py|Python|[[B@6faa347a]|[math, abs, x, re...| 46 | |factorial/factori...|Python|[[B@754ce81c]|[factorial, n, in...| 47 | +--------------------+------+-------------+--------------------+ 48 | only showing top 20 rows 49 | */ 50 | ``` 51 | -------------------------------------------------------------------------------- /_examples/siva-files/2d58138f24fa863c235b0c33158b870a40c79ee2.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/2d58138f24fa863c235b0c33158b870a40c79ee2.siva -------------------------------------------------------------------------------- /_examples/siva-files/5d4a8bf30c0da7209f651632b62a362620556c85.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/5d4a8bf30c0da7209f651632b62a362620556c85.siva -------------------------------------------------------------------------------- /_examples/siva-files/aac052c42c501abf6aa8c3509424e837bb27e188.siva: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/_examples/siva-files/aac052c42c501abf6aa8c3509424e837bb27e188.siva -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import java.nio.file.{Files, StandardCopyOption} 2 | 3 | import Dependencies.{scalaTest, _} 4 | import sbt.Keys.libraryDependencies 5 | 6 | organization := "tech.sourced" 7 | scalaVersion := "2.11.11" 8 | name := "jgit-spark-connector" 9 | 10 | git.useGitDescribe := true 11 | enablePlugins(GitVersioning) 12 | 13 | libraryDependencies += scalaTest % Test 14 | libraryDependencies += scoverage % Test 15 | libraryDependencies += sparkSql % Provided 16 | libraryDependencies += newerHadoopClient % Provided //due to newer v. of guava in bblfsh 17 | // grpc for bblfsh/client-scala needs to be newer than in Spark 18 | libraryDependencies += fixNettyForGrpc 19 | libraryDependencies += jgit % Compile 20 | libraryDependencies += siva % Compile 21 | libraryDependencies += bblfsh % Compile 22 | libraryDependencies += commonsIO % Compile 23 | libraryDependencies += commonsPool % Compile 24 | libraryDependencies += enry % Compile 25 | libraryDependencies += scalaLib % Provided 26 | libraryDependencies += sqlite % Compile 27 | libraryDependencies += sqlite % Test 28 | libraryDependencies += metrics % Compile 29 | 30 | testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, "-oUT") 31 | 32 | test in assembly := {} 33 | assemblyJarName in assembly := s"${name.value}-${version.value}.jar" 34 | 35 | parallelExecution in Test := false 36 | logBuffered in Test := false 37 | 38 | // Shade everything but tech.sourced.engine so the user does not have conflicts 39 | assemblyShadeRules in assembly := Seq( 40 | ShadeRule.rename("com.google.common.**" -> 41 | "tech.sourced.engine.shaded.com.google.common.@1").inAll, 42 | ShadeRule.rename("com.google.protobuf.**" -> 43 | "tech.sourced.engine.shaded.com.google.protobuf.@1").inAll, 44 | ShadeRule.rename("io.netty.**" -> 45 | "tech.sourced.engine.shaded.io.netty.@1").inAll 46 | ) 47 | 48 | assemblyMergeStrategy in assembly := { 49 | case "META-INF/io.netty.versions.properties" => MergeStrategy.last 50 | case x => 51 | val oldStrategy = (assemblyMergeStrategy in assembly).value 52 | oldStrategy(x) 53 | } 54 | 55 | sonatypeProfileName := "tech.sourced" 56 | 57 | // pom settings for sonatype 58 | homepage := Some(url("https://github.com/src-d/jgit-spark-connector")) 59 | scmInfo := Some(ScmInfo(url("https://github.com/src-d/jgit-spark-connector"), 60 | "git@github.com:src-d/jgit-spark-connector.git")) 61 | developers += Developer("ajnavarro", 62 | "Antonio Navarro", 63 | "antonio@sourced.tech", 64 | url("https://github.com/ajnavarro")) 65 | developers += Developer("bzz", 66 | "Alexander Bezzubov", 67 | "alex@sourced.tech", 68 | url("https://github.com/bzz")) 69 | developers += Developer("mcarmonaa", 70 | "Manuel Carmona", 71 | "manuel@sourced.tech", 72 | url("https://github.com/mcarmonaa")) 73 | developers += Developer("erizocosmico", 74 | "Miguel Molina", 75 | "miguel@sourced.tech", 76 | url("https://github.com/erizocosmico")) 77 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) 78 | pomIncludeRepository := (_ => false) 79 | 80 | crossPaths := false 81 | publishMavenStyle := true 82 | 83 | val SONATYPE_USERNAME = 
scala.util.Properties.envOrElse("SONATYPE_USERNAME", "NOT_SET") 84 | val SONATYPE_PASSWORD = scala.util.Properties.envOrElse("SONATYPE_PASSWORD", "NOT_SET") 85 | credentials += Credentials( 86 | "Sonatype Nexus Repository Manager", 87 | "oss.sonatype.org", 88 | SONATYPE_USERNAME, 89 | SONATYPE_PASSWORD) 90 | 91 | val SONATYPE_PASSPHRASE = scala.util.Properties.envOrElse("SONATYPE_PASSPHRASE", "not set") 92 | 93 | useGpg := false 94 | pgpSecretRing := baseDirectory.value / "project" / ".gnupg" / "secring.gpg" 95 | pgpPublicRing := baseDirectory.value / "project" / ".gnupg" / "pubring.gpg" 96 | pgpPassphrase := Some(SONATYPE_PASSPHRASE.toArray) 97 | 98 | packageBin in Compile := { 99 | val file = (packageBin in Compile).value 100 | val dest = new java.io.File(file.getParent, s"${name.value}-${version.value}-slim.jar") 101 | Files.copy( 102 | new java.io.File(file.getAbsolutePath).toPath, 103 | dest.toPath, 104 | StandardCopyOption.REPLACE_EXISTING 105 | ) 106 | Files.delete(file.toPath) 107 | dest 108 | } 109 | 110 | publishArtifact in (Compile, packageBin) := false 111 | 112 | val packageSlim = taskKey[File]("package-slim") 113 | 114 | packageSlim := (packageBin in Compile).value 115 | 116 | addArtifact(Artifact("jgit-spark-connector", "jar", "jar", "slim"), packageSlim) 117 | 118 | assembly := { 119 | val file = assembly.value 120 | val dest = new java.io.File(file.getParent, s"${name.value}-uber.jar") 121 | Files.copy( 122 | new java.io.File(file.getAbsolutePath).toPath, 123 | dest.toPath, 124 | StandardCopyOption.REPLACE_EXISTING 125 | ) 126 | file 127 | } 128 | 129 | assembly := assembly.dependsOn(packageBin in Compile).value 130 | 131 | addArtifact(artifact in(Compile, assembly), assembly) 132 | 133 | isSnapshot := version.value endsWith "SNAPSHOT" 134 | 135 | publishTo := { 136 | val nexus = "https://oss.sonatype.org/" 137 | if (isSnapshot.value) { 138 | Some("snapshots" at nexus + "content/repositories/snapshots") 139 | } else { 140 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-000.md: -------------------------------------------------------------------------------- 1 | ENIP Template 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 0 | 6 | | Title | ENIP Template | 7 | | Author | Antonio Navarro | 8 | | Status | Accepted | 9 | | Created | 2017-05-26 | 10 | | Updated | 2017-09-13 | 11 | | Target version | optional | 12 | 13 | ## Abstract 14 | 15 | A short description of the technical issue being addressed. 16 | 17 | ## Rationale 18 | 19 | Proposal motivation. 20 | 21 | ## Specification 22 | 23 | Technical specification of the changes proposed. 24 | 25 | ## Alternatives 26 | 27 | How the issue is currently solved or can be solved if this change is not accepted. 28 | 29 | ## Impact 30 | 31 | How this change would impact jgit-spark-connector functionality: backward compatibility broken, performance improvements or issues, corner cases and so on. 32 | 33 | ## References 34 | 35 | Links to additional documentation describing related features or other kind of related information. 
36 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-001.md: -------------------------------------------------------------------------------- 1 | # Pre-compute repository metadata and save it to another DataSource 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 1 | 6 | | Title | Pre-compute repository metadata and save it to another DataSource | 7 | | Author | Antonio Navarro | 8 | | Status | Rejected | 9 | | Created | 2017-11-14 | 10 | | Updated | 2017-11-21 | 11 | | Target version | - | 12 | 13 | ## Abstract 14 | 15 | With this change we want to improve the performance of reading repositories metadata, 16 | saving that metadata in other DataSource than GitDataSource. 17 | It can be any of the already implemented ones (json,parquet,jdbc and so on). 18 | 19 | ## Rationale 20 | 21 | Reading the content of siva files over and over again is not really performant. 22 | With this ENIP we want a way to improve speed reading metadata (repositories, references, commits, and tree entries). 23 | 24 | To do that, 25 | we are going to add new methods on the api using the already existing methods on DataFrame API, 26 | [reader][1] and [writer][2]. 27 | 28 | ## Specification 29 | To be able to register other datasource than GitDataSource, we should change a bit the way that we are geting the datasources to process commits, references, or blobs. 30 | 31 | Actually we are registering datasources using `getDatasource` method: 32 | 33 | ```scala 34 | /** 35 | * Returns a [[org.apache.spark.sql.DataFrame]] for the given table using the provided 36 | * [[org.apache.spark.sql.SparkSession]]. 37 | * 38 | * @param table name of the table 39 | * @param session spark session 40 | * @return dataframe for the given table 41 | */ 42 | private[engine] def getDataSource(table: String, session: SparkSession): DataFrame = 43 | session.read.format("tech.sourced.engine.DefaultSource") 44 | .option("table", table) 45 | .load(session.sqlContext.getConf(repositoriesPathKey)) 46 | ``` 47 | 48 | Instead of this, we can create a view using the SparkSession from several datasources: 49 | 50 | ```scala 51 | /** 52 | * Creates a local temporary view using the given name. The lifetime of this 53 | * temporary view is tied to the [[SparkSession]] that was used to create this Dataset. 54 | * 55 | * @group basic 56 | * @since 2.0.0 57 | */ 58 | def createOrReplaceTempView(viewName: String): Unit = withPlan { 59 | createTempViewCommand(viewName, replace = true, global = false) 60 | } 61 | ``` 62 | 63 | `createOrReplaceTempView` method will allow us to register tables at the engine instantiation with several datasources. 64 | Then, from implicit DataFrame methods, we can do: 65 | 66 | ```scala 67 | val commitsDf = df.sparkSession.table("commits") 68 | ``` 69 | 70 | Instead of: 71 | ```scala 72 | val commitsDf = getDataSource("commits", df.sparkSession) 73 | ``` 74 | 75 | Then, the list of needed changes on the Engine API are: 76 | - Initialize GitDataSource views at Engine initialization 77 | - Add method `backMetadataToSource(options)` (name to decide) into the Engine API. 78 | - Add method `fromMetadataSource(options)` (name to decide) into the Engine API. 79 | That method will change all the default registered views to the specified DataSource. 80 | 81 | We should check speed improvement with a substantial amount of repositories and several DataSources. 82 | 83 | ## Alternatives 84 | Using the already existing Spark Dataframe API, 85 | we can save that metadata. 
86 | 87 | Example: 88 | ```scala 89 | repositoriesDf.write.bucketBy(100,"repository_url").parquet("repositories.parquet") 90 | // or 91 | repositoriesDf.write.jdbc(url, tableName, properties) 92 | 93 | ``` 94 | 95 | And then read it back using the `SparkSession.read` method. 96 | 97 | ## Impact 98 | 99 | The current Join Rule optimization for Git Datasources will not be applied. 100 | That means that, 101 | if we do a Join between two JDBC datasource tables, 102 | the Join will be executed at the Spark level, 103 | doing a full scan on both JDBC tables. 104 | That can work really well with a small amount of repositories and siva files, 105 | but if we want an Engine as scalable as Spark, we should avoid this kind of operation. 106 | 107 | ## References 108 | 109 | [DataFrameReader API][1] 110 | 111 | [DataFrameWriter API][2] 112 | 113 | [1]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrameReader 114 | [2]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrameWriter 115 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-002.md: -------------------------------------------------------------------------------- 1 | # Use Borges DB as metadata on Engine instead of config files inside siva files. 2 | 3 | | Field | Value | 4 | | --- | --- | 5 | | ENIP | 2 | 6 | | Title | Use Borges DB as metadata on Engine instead of config files inside siva files. | 7 | | Author | Antonio Navarro | 8 | | Status | Rejected | 9 | | Created | 2017-11-15 | 10 | | Updated | 2017-11-21 | 11 | | Target version | 0.X | 12 | 13 | ## Abstract 14 | 15 | In our current pipeline, 16 | the component in charge of fetching repositories and organizing them into rooted repositories ([Borges][2]) already creates a database with a lot of useful metadata about the repositories. 17 | With this ENIP we want to use that metadata to improve Engine performance. 18 | 19 | ## Rationale 20 | 21 | Borges creates a new row per fetched repository. 22 | The current schema is: 23 | 24 | |Column|Type|Description| 25 | |---|---|---| 26 | |ID|ULID|Unique ULID for a repository | 27 | |CreatedAt|Timestamp|| 28 | |UpdatedAt|Timestamp|| 29 | |Endpoints|Array[String]|Endpoints is a slice of valid git endpoints to reach this repository. For example, git://host/my/repo.git and https://host/my/repo.git. They are meant to be endpoints of the same exact repository, and not mirrors.| 30 | |FetchStatus|String|Current status of the repository; it can be "not_found", "fetched", "pending" or "fetching"| 31 | |FetchedAt|Timestamp|FetchedAt is the timestamp of the last time this repository was fetched and archived in our repository storage successfully.| 32 | |FetchedErrorAt|Timestamp|FetchErrorAt is the timestamp of the last fetch error, if any.| 33 | |LastCommitAt|Timestamp|LastCommitAt is the last commit time found in this repository.| 34 | |References|JsonB|References is the current slice of references as present in our repository storage.| 35 | |IsFork|Boolean|IsFork stores if this repository is a fork or not.
It can be nil if we don't know.| 36 | 37 | The content of the References Json is: 38 | 39 | |Column|Type|Description| 40 | |---|---|---| 41 | |Name|String|Name is the full reference name.| 42 | |Hash|Array[Byte]|Hash is the hash of the reference.| 43 | |Init|Array[Byte]|Init is the hash of the init commit reached from this reference.| 44 | |Roots|Array[Array[Byte]]|Roots is a slice of the hashes of all root commits reachable from this reference.| 45 | |Time|Timestamp|Time is the time of the commit this reference points to.| 46 | 47 | The JDBC connector returns [json and jsonb types as String][1], 48 | so we should apply a user-defined function to parse them into a StructType in order to query their internal content. 49 | 50 | ## Specification 51 | 52 | Create a new method on the Engine that allows us to register the "repositories" and "references" tables as views from a JDBC datasource. 53 | We should give these methods Borges-related names because this functionality is tightly tied to it. 54 | Example: `fromBorgesMetadata(options)` 55 | 56 | Because "references" is one of the columns of the "repositories" table, 57 | we should create a view from a query that applies the `from_json()` function 58 | and expands the result into a new table with all the reference elements of the arrays. 59 | 60 | As a first approach, the current table schemas will be preserved and the data mapped to them. 61 | The schema can later be modified as long as the main columns, 62 | the ones used to join data between tables, 63 | are preserved. 64 | 65 | The "repositories" view will filter out all repositories that are not in *fetched* status to avoid consistency problems with the existing rooted repositories. 66 | 67 | We also need to check whether the names for special references (HEAD and master) are specified in the same way as they are in rooted repositories. 68 | 69 | The existing logic to generate the repository id in the Engine will be reused to get that data from the Borges table. 70 | 71 | ## Alternatives 72 | 73 | The current Spark API allows us to create a DataFrame from any JDBC connection. 74 | 75 | ## Impact 76 | 77 | A new method on the Engine to use the Borges views instead of the standard ones. 78 | 79 | ## References 80 | - [Borges][2] 81 | - [json and jsonb types are returned as String][1] 82 | 83 | [1]: https://github.com/apache/spark/blob/0c0ad436ad909364915b910867d08262c62bc95d/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L58 84 | [2]: https://github.com/src-d/borges 85 | -------------------------------------------------------------------------------- /documentation/proposals/ENIP-004.md: -------------------------------------------------------------------------------- 1 | | Field | Value | 2 | | --- | --- | 3 | | ENIP | 4 | 4 | | Title | Get only first reference commit by default | 5 | | Author | Miguel Molina | 6 | | Status | Accepted | 7 | | Created | 2018-01-10 | 8 | | Updated | 2018-01-10 | 9 | | Target version | `0.4.x` | 10 | 11 | ## Abstract 12 | 13 | The purpose of this proposal is to make the engine's default behavior be to get only the first reference commit (i.e. the current state of that reference), and to get all reference commits only when explicitly asked via the `getAllReferenceCommits` method. 14 | 15 | ## Rationale 16 | 17 | The rationale behind this is that most of the time what you want is `getFirstReferenceCommit`, which is the obvious thing a person would expect the engine to do.
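As a rough illustration (a hypothetical usage sketch based on the query-chaining examples in `_examples/`, not part of this proposal's specification), the proposed default versus the explicit opt-in would look like this:

```scala
import tech.sourced.engine._

val engine = Engine(spark, "/path/to/siva-files", "siva")

// Proposed default: only the tip commit of each reference (the `index = 0` case).
val tipCommits = engine.getRepositories.getReferences.getCommits

// Explicit opt-in: every commit reachable from each reference.
val allCommits = engine.getRepositories.getReferences.getAllReferenceCommits
```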
If you want the more detailed output provided by `getAllReferenceCommits`, you are explicitly opting in, knowing that it will severely harm the performance of the job you're running. 18 | 19 | So this is a default that makes more sense, for the following reasons: 20 | 21 | * More obvious behavior for newcomers. 22 | * A default that does not severely impact performance. 23 | * The previous behavior can still be achieved. 24 | 25 | ## Specification 26 | 27 | While this may seem like a very simple and easy issue, it is not so simple once one takes a close look at how the engine queries are built. Right now, `getFirstReferenceCommit` adds a simple `index = 0` filter, but that's the opt-in behavior. If we want to make it the default, we need to make this change at the iterator level rather than by adding filters to the query, because otherwise `getAllReferenceCommits` would not be able to remove the filter node. 28 | 29 | - `CommitIterator` needs to be changed to get just the first commit if no `index` filter is provided, and to get N commits if one is. 30 | - `getAllReferenceCommits` can't provide a single number to match, like `index = 0`; it would need to express `index >= 0`, which would require the `EqualThanOrEqual` filter. 31 | - Implement the `EqualThanOrEqual` filter, which requires a complete refactor of the filters: they currently work only for equality, providing a list of values that match the given filter instead of returning a function that can be evaluated inside the iterators. 32 | 33 | 34 | ## Alternatives 35 | 36 | As an alternative, instead of `index >= 0` one could use `index <> -1`, which would work with the current filters implementation, thus making the implementation of this proposal easier, although hackier. 37 | 38 | I would suggest taking the first approach, so that we can pave the road to support all possible filter nodes, which would make our iterators more efficient at filtering data without relying on Spark to do that. 39 | 40 | ## Impact 41 | 42 | This change breaks compatibility with all prior versions because it changes the output produced by the engine, so it should be released as a `0.4.x` version. 43 | 44 | ## References 45 | 46 | n/a 47 | -------------------------------------------------------------------------------- /documentation/proposals/README.md: -------------------------------------------------------------------------------- 1 | # Engine Improvement Proposals 2 | 3 | ## Introduction 4 | 5 | This is the index of Engine Improvement Proposals, known as ENIPs.
6 | 7 | ## All Proposals by Number 8 | 9 | | Number | Status | Title | 10 | | ------ | -------- |----------------------------------------------------------------------| 11 | | 0 | Accepted | [ENIP Template](ENIP-000.md)| 12 | | 1 | Rejected | [Pre-compute repository metadata and save it to another DataSource](ENIP-001.md)| 13 | | 2 | Rejected | [Use Borges DB as metadata on Engine instead of config files inside siva files.](ENIP-002.md)| 14 | | 3 | Draft | [Local SQLite database per worker for on-demand metadata storage](ENIP-003.md)| 15 | -------------------------------------------------------------------------------- /key.asc.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/key.asc.enc -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | lazy val sparkVersion: String = sys.props.get("spark.version") 5 | .getOrElse("2.2.1") 6 | 7 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.1" 8 | lazy val scoverage = "org.scoverage" %% "scalac-scoverage-plugin" % "1.3.1" 9 | lazy val sparkSql = "org.apache.spark" %% "spark-sql" % sparkVersion 10 | lazy val newerHadoopClient = "org.apache.hadoop" % "hadoop-client" % "2.7.2" 11 | lazy val fixNettyForGrpc = "io.netty" % "netty-all" % "4.1.17.Final" 12 | lazy val jgit = "org.eclipse.jgit" % "org.eclipse.jgit" % "4.9.0.201710071750-r" 13 | lazy val siva = "tech.sourced" % "siva-java" % "0.1.3" 14 | lazy val bblfsh = "org.bblfsh" % "bblfsh-client" % "1.9.1" 15 | lazy val enry = "tech.sourced" % "enry-java" % "1.6.3" 16 | lazy val commonsIO = "commons-io" % "commons-io" % "2.5" 17 | lazy val commonsPool = "org.apache.commons" % "commons-pool2" % "2.4.3" 18 | lazy val scalaLib = "org.scala-lang" % "scala-library" % "2.11.11" 19 | lazy val sqlite = "org.xerial" % "sqlite-jdbc" % "3.21.0" 20 | lazy val metrics = "com.groupon.dse" % "spark-metrics" % "2.0.0" 21 | } 22 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.13 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 3 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") 4 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 5 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.2") 6 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 7 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1") 8 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.9.3") 9 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | dist 4 | build 5 | *.log 6 | jars 7 | *.pyc 8 | metastore_db 9 | *.zip -------------------------------------------------------------------------------- /python/MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include README.md version.txt 2 | recursive-include jars *.jar 3 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | ENGINE_UBER_JAR = jgit-spark-connector-uber.jar 2 | ENGINE_UBER_JAR_LOCATION = ../target/$(ENGINE_UBER_JAR) 3 | JARS_DIR = jars 4 | 5 | 6 | $(JARS_DIR): 7 | mkdir -p $(JARS_DIR) 8 | 9 | .PHONY: test clean 10 | test: clean $(JARS_DIR) 11 | cp $(ENGINE_UBER_JAR_LOCATION) $(JARS_DIR) && \ 12 | python -m unittest discover -v -s ./test -t . 13 | 14 | clean: 15 | if [ -d $(JARS_DIR) ] ; \ 16 | then \ 17 | rm -r $(JARS_DIR) ; \ 18 | fi 19 | -------------------------------------------------------------------------------- /python/README.rst: -------------------------------------------------------------------------------- 1 | jgit-spark-connector 2 | ---------------------------- 3 | 4 | Python wrapper of the jgit-spark-connector to perform analysis on top of source code. -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | from setuptools import setup 3 | import os 4 | from os.path import exists, join, dirname, realpath 5 | 6 | CURR_DIR = dirname(realpath(__file__)) 7 | VERSION_FILE = join(CURR_DIR, "version.txt") 8 | README_FILE = join(CURR_DIR, "README.rst") 9 | 10 | if exists(VERSION_FILE): 11 | with open(VERSION_FILE, 'r') as f: 12 | __version__ = f.read().strip() 13 | else: 14 | __version__ = 'local' 15 | 16 | with open(README_FILE, 'r') as f: 17 | README = f.read() 18 | 19 | setup( 20 | name="sourced-jgit-spark-connector", 21 | description="Engine to use Spark on top of source code repositories.", 22 | long_description=README, 23 | version=__version__, 24 | license="Apache-2.0", 25 | author="source{d}", 26 | author_email="hello@sourced.tech", 27 | url="https://github.com/src-d/jgit-spark-connector/tree/master/python", 28 | packages=['sourced.engine'], 29 | namespace_packages=['sourced'], 30 | install_requires=[ 31 | "pyspark==" + os.environ.get('SPARK_VERSION', "2.2.1"), 32 | "bblfsh==2.9.13" 33 | ], 34 | classifiers=[ 35 | "Development Status :: 2 - Pre-Alpha", 36 | "Intended Audience :: Developers", 37 | "License :: OSI Approved :: Apache Software License", 38 | "Topic :: Scientific/Engineering :: Information Analysis", 39 | "Programming Language :: Python :: 2.7", 40 | "Programming Language :: Python :: 3" 41 | ] 42 | ) 43 | -------------------------------------------------------------------------------- /python/sourced/__init__.py: -------------------------------------------------------------------------------- 1 | # You must not include any other code and data in a namespace package's __init__.py 2 | import pkg_resources 3 | pkg_resources.declare_namespace(__name__) 4 | -------------------------------------------------------------------------------- /python/sourced/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from importlib import import_module 3 | from bblfsh.sdkversion 
import VERSION 4 | 5 | from sourced.engine.engine import Engine, SourcedDataFrame 6 | 7 | def parse_uast_node(data): 8 | """ 9 | Parses a byte array and turns it into an UAST node. 10 | 11 | >>> parse_uast_node(row["uast"]) 12 | 13 | :param data: binary-encoded uast as a byte array 14 | :type data: byte array 15 | :rtype: UAST node 16 | """ 17 | return import_module( 18 | "bblfsh.gopkg.in.bblfsh.sdk.%s.uast.generated_pb2" % VERSION)\ 19 | .Node.FromString(data) -------------------------------------------------------------------------------- /python/sourced/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/python/sourced/examples/__init__.py -------------------------------------------------------------------------------- /python/sourced/examples/basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sourced.engine import Engine 3 | from pyspark.sql import SparkSession 4 | 5 | def main(): 6 | file_path = os.path.dirname(os.path.realpath(__file__)) 7 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 8 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 9 | engine = Engine(session, repos_path, "siva") 10 | engine.repositories.references.master_ref.commits.show() 11 | 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /python/sourced/examples/repo_files.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | rows = engine.repositories.references.head_ref.commits\ 13 | .tree_entries.select('path').collect() 14 | 15 | files = [r['path'] for r in rows] 16 | 17 | print("FILES:") 18 | for f in files: 19 | print(f) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /python/sourced/examples/repo_references.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | refs = engine.repositories.filter('id = "github.com/xiyou-linuxer/faq-xiyoulinux"')\ 13 | .references.select('name').collect() 14 | 15 | refs = [r['name'] for r in refs] 16 | 17 | print("REFERENCES:") 18 | for r in refs: 19 | print(r) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /python/sourced/examples/repos.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from sourced.engine import Engine 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def main(): 8 | file_path = os.path.dirname(os.path.realpath(__file__)) 9 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 10 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 11 | engine = Engine(session, repos_path, "siva") 12 | rows = engine.repositories.select('id').collect() 13 | 14 | repos = [r['id'] for r in rows] 15 | 16 | print("REPOS:") 17 | for r in repos: 18 | print(r) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /python/sourced/examples/uasts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sourced.engine import Engine 3 | from pyspark.sql import SparkSession 4 | 5 | def main(): 6 | file_path = os.path.dirname(os.path.realpath(__file__)) 7 | repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files') 8 | session = SparkSession.builder.appName("test").master('local[*]').getOrCreate() 9 | engine = Engine(session, repos_path, "siva") 10 | engine.repositories.references\ 11 | .filter('name = "refs/heads/develop"')\ 12 | .commits.tree_entries.blobs\ 13 | .classify_languages()\ 14 | .filter('lang = "Ruby"')\ 15 | .extract_uasts()\ 16 | .show() 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/python/test/__init__.py -------------------------------------------------------------------------------- /python/test/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import realpath, dirname, join 3 | from glob import glob 4 | 5 | jars_path = join(dirname(dirname(realpath(__file__))), "jars") 6 | jars = ':'.join(glob(join(jars_path, '*.jar'))) 7 | os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars %s pyspark-shell' % jars 8 | 9 | from pyspark.sql import SparkSession 10 | from unittest import TestCase 11 | 12 | 13 | class BaseTestCase(TestCase): 14 | def setUp(self): 15 | self.session = SparkSession.builder.appName("test").master("local[*]").getOrCreate() -------------------------------------------------------------------------------- /python/test/test_sourced_dataframe.py: -------------------------------------------------------------------------------- 1 | from sourced.engine import SourcedDataFrame 2 | from .base import BaseTestCase 3 | 4 | 5 | class SourcedDataFrameTestCase(BaseTestCase): 6 | def setUp(self): 7 | BaseTestCase.setUp(self) 8 | df = self.session.createDataFrame([('Alice', 18), ('Amy', 23), ('Cole', 22), ('Aaron', 25), ('Sue', 52)]) 9 | self.df = SourcedDataFrame(df._jdf, self.session, None) 10 | 11 | 12 | def test_filter(self): 13 | self.assert_names(self.df.filter(self.df[1] % 2 == 0), 14 | ['Alice', 'Cole', 'Sue']) 15 | 16 | def test_sort(self): 17 | self.assert_names(self.df.sort(self.df[1]), 18 | ['Alice', 'Cole', 'Amy', 'Aaron', 'Sue']) 19 | 20 | 21 | def assert_names(self, df, names): 22 | result = [r[0] for r in 
df.select(df[0]).collect()] 23 | self.assertEqual(result, names) -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- Scalastyle standard configuration -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/UtilsWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark 2 | 3 | import org.apache.spark.util.Utils 4 | 5 | object UtilsWrapper { 6 | def getLocalDir(conf: SparkConf): String = Utils.getLocalDir(conf) 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.groupon.metrics.UserMetricsSystem 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 6 | import org.apache.spark.sql.sources._ 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.{Row, SQLContext, SparkSession} 9 | import org.apache.spark.{SparkException, UtilsWrapper} 10 | import tech.sourced.engine.iterator._ 11 | import tech.sourced.engine.provider.{RepositoryProvider, RepositoryRDDProvider} 12 | 13 | /** 14 | * Default source to provide new git relations. 15 | */ 16 | class DefaultSource extends RelationProvider with DataSourceRegister { 17 | 18 | /** @inheritdoc */ 19 | override def shortName: String = "git" 20 | 21 | /** @inheritdoc */ 22 | override def createRelation(sqlContext: SQLContext, 23 | parameters: Map[String, String]): BaseRelation = { 24 | val table = parameters.getOrElse( 25 | DefaultSource.TableNameKey, 26 | throw new SparkException("parameter 'table' must be provided") 27 | ) 28 | 29 | val schema: StructType = Schema(table) 30 | 31 | GitRelation(sqlContext.sparkSession, schema, tableSource = Some(table)) 32 | } 33 | 34 | } 35 | 36 | /** 37 | * Just contains some useful constants for the DefaultSource class to use. 38 | */ 39 | object DefaultSource { 40 | val TableNameKey = "table" 41 | val PathKey = "path" 42 | } 43 | 44 | /** 45 | * A relation based on git data from rooted repositories in siva files. The data this relation 46 | * will offer depends on the given `tableSource`, which controls the table that will be accessed. 47 | * Also, the [[tech.sourced.engine.rule.GitOptimizer]] might merge some table sources into one by 48 | * squashing joins, so the result will be the resultant table chained with the previous one using 49 | * chained iterators.
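 * For example, when the repositories, references and commits sources are squashed into one relation, buildScan serves it with a RepositoryIterator feeding a ReferenceIterator, which in turn feeds a CommitIterator.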
50 | * 51 | * @param session Spark session 52 | * @param schema schema of the relation 53 | * @param joinConditions join conditions, if any 54 | * @param tableSource source table if any 55 | */ 56 | case class GitRelation(session: SparkSession, 57 | schema: StructType, 58 | joinConditions: Option[Expression] = None, 59 | tableSource: Option[String] = None) 60 | extends BaseRelation with CatalystScan { 61 | 62 | private val localPath: String = UtilsWrapper.getLocalDir(session.sparkContext.getConf) 63 | private val path: String = session.conf.get(RepositoriesPathKey) 64 | private val repositoriesFormat: String = session.conf.get(RepositoriesFormatKey) 65 | private val skipCleanup: Boolean = session.conf. 66 | get(SkipCleanupKey, default = "false").toBoolean 67 | private val skipReadErrors: Boolean = session.conf. 68 | get(SkipReadErrorsKey, default = "false").toBoolean 69 | private val parallelism: Int = session.sparkContext.defaultParallelism 70 | 71 | // this needs to be overridden to extend BaseRelataion, 72 | // though is not very useful since already we have the SparkSession 73 | override def sqlContext: SQLContext = session.sqlContext 74 | 75 | override def unhandledFilters(filters: Array[Filter]): Array[Filter] = { 76 | super.unhandledFilters(filters) 77 | } 78 | 79 | override def buildScan(requiredColumns: Seq[Attribute], 80 | filters: Seq[Expression]): RDD[Row] = { 81 | val sc = session.sparkContext 82 | val reposRDD = RepositoryRDDProvider(sc).get(path, repositoriesFormat) 83 | 84 | val requiredCols = sc.broadcast(requiredColumns.map(_.name).toArray) 85 | val reposLocalPath = sc.broadcast(localPath) 86 | val sources = sc.broadcast(Sources.getSources(tableSource, schema)) 87 | val filtersBySource = sc.broadcast(Sources.getFiltersBySource(filters)) 88 | 89 | reposRDD.flatMap(source => { 90 | val provider = RepositoryProvider(reposLocalPath.value, skipCleanup, parallelism * 2) 91 | 92 | val repo = UserMetricsSystem.timer("RepositoryProvider").time({ 93 | provider.get(source) 94 | }) 95 | 96 | // since the sources are ordered by their hierarchy, we can chain them like this 97 | // using the last used iterator as input for the current one 98 | var iter: Option[ChainableIterator[_]] = None 99 | sources.value.foreach({ 100 | case k@"repositories" => 101 | iter = Some(new RepositoryIterator( 102 | source.root, 103 | requiredCols.value, 104 | repo, 105 | filtersBySource.value.getOrElse(k, Seq()), 106 | skipReadErrors 107 | )) 108 | 109 | case k@"references" => 110 | iter = Some(new ReferenceIterator( 111 | requiredCols.value, 112 | repo, 113 | iter.map(_.asInstanceOf[RepositoryIterator]).orNull, 114 | filtersBySource.value.getOrElse(k, Seq()), 115 | skipReadErrors 116 | )) 117 | 118 | case k@"commits" => 119 | iter = Some(new CommitIterator( 120 | requiredCols.value, 121 | repo, 122 | iter.map(_.asInstanceOf[ReferenceIterator]).orNull, 123 | filtersBySource.value.getOrElse(k, Seq()), 124 | skipReadErrors 125 | )) 126 | 127 | case k@"tree_entries" => 128 | iter = Some(new GitTreeEntryIterator( 129 | requiredCols.value, 130 | repo, 131 | iter.map(_.asInstanceOf[CommitIterator]).orNull, 132 | filtersBySource.value.getOrElse(k, Seq()), 133 | skipReadErrors 134 | )) 135 | 136 | case k@"blobs" => 137 | iter = Some(new BlobIterator( 138 | requiredCols.value, 139 | repo, 140 | iter.map(_.asInstanceOf[GitTreeEntryIterator]).orNull, 141 | filtersBySource.value.getOrElse(k, Seq()), 142 | skipReadErrors 143 | )) 144 | 145 | case other => throw new SparkException(s"required cols for '$other' is not 
supported") 146 | }) 147 | 148 | // FIXME: when the RDD is persisted to disk the last element of this iterator is closed twice 149 | new CleanupIterator(iter.getOrElse(Seq().toIterator), provider.close(source, repo)) 150 | }) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/Schema.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.catalyst.expressions.Attribute 5 | import org.apache.spark.sql.types._ 6 | 7 | /** 8 | * Schema contains all the schemas of the multiple tables offered by this library. 9 | */ 10 | private[engine] object Schema { 11 | 12 | /** 13 | * Repositories table schema. Contains just the identifier of the repository, 14 | * its URLs and whether it's a fork or not. 15 | */ 16 | val repositories = StructType( 17 | StructField("id", StringType, nullable = false) :: 18 | StructField("urls", ArrayType(StringType, containsNull = false), nullable = false) :: 19 | StructField("is_fork", BooleanType) :: 20 | StructField("repository_path", StringType) :: 21 | Nil 22 | ) 23 | 24 | /** 25 | * References table schema containing the repository to which they belong, 26 | * the name and the hash of the object they point to. 27 | */ 28 | val references = StructType( 29 | StructField("repository_id", StringType, nullable = false) :: 30 | StructField("name", StringType, nullable = false) :: 31 | StructField("hash", StringType, nullable = false) :: 32 | StructField("is_remote", BooleanType, nullable = false) :: 33 | Nil 34 | ) 35 | 36 | /** 37 | * Commits table schema containing all the data about commits. 38 | */ 39 | val commits = StructType( 40 | StructField("repository_id", StringType, nullable = false) :: 41 | StructField("reference_name", StringType, nullable = false) :: 42 | StructField("index", IntegerType, nullable = false) :: 43 | StructField("hash", StringType, nullable = false) :: 44 | StructField("message", StringType, nullable = false) :: 45 | StructField("parents", ArrayType(StringType, containsNull = false)) :: 46 | StructField("parents_count", IntegerType, nullable = false) :: 47 | 48 | StructField("author_email", StringType) :: 49 | StructField("author_name", StringType) :: 50 | StructField("author_date", TimestampType) :: 51 | 52 | StructField("committer_email", StringType) :: 53 | StructField("committer_name", StringType) :: 54 | StructField("committer_date", TimestampType) :: 55 | 56 | Nil 57 | ) 58 | 59 | /** 60 | * Tree Entries table schema containing all the tree entries data. 61 | */ 62 | val treeEntries = StructType( 63 | StructField("commit_hash", StringType, nullable = false) :: 64 | StructField("repository_id", StringType, nullable = false) :: 65 | StructField("reference_name", StringType, nullable = false) :: 66 | StructField("path", StringType, nullable = false) :: 67 | StructField("blob", StringType, nullable = false) :: 68 | Nil 69 | ) 70 | 71 | /** 72 | * Blobs table schema containing all the blobs data. 
73 | */ 74 | val blobs = StructType( 75 | StructField("blob_id", StringType, nullable = false) :: 76 | StructField("commit_hash", StringType, nullable = false) :: 77 | StructField("repository_id", StringType, nullable = false) :: 78 | StructField("reference_name", StringType, nullable = false) :: 79 | StructField("content", BinaryType) :: 80 | StructField("is_binary", BooleanType, nullable = false) :: 81 | Nil 82 | ) 83 | 84 | /** 85 | * Return the schema for the table with the given name. Throws a SparkException 86 | * if there is no schema for the given table. 87 | * 88 | * @param table name 89 | * @return schema for the table 90 | * @throws SparkException if the table does not exist 91 | */ 92 | def apply(table: String): StructType = table match { 93 | case "repositories" => Schema.repositories 94 | case "references" => Schema.references 95 | case "commits" => Schema.commits 96 | case "tree_entries" => Schema.treeEntries 97 | case "blobs" => Schema.blobs 98 | case other => throw new SparkException(s"table '$other' is not supported") 99 | } 100 | 101 | /** 102 | * Returns a tuple with the table and column names for the given attribute. 103 | * Because metadata tables are different from git relation tables, some fields 104 | * need to be mapped to match one schema with the other. 105 | * 106 | * @param attr attribute from the git relation schema 107 | * @return table and column names 108 | */ 109 | def metadataTableAndCol(attr: Attribute): (String, String) = { 110 | val name = attr.name 111 | val table = attr.metadata.getString(Sources.SourceKey) 112 | metadataMappings(table, name).getOrElse((table, name)) 113 | } 114 | 115 | /** 116 | * Mappings between a table name and column name in the git relation schema 117 | * and their counterpart in the metadata schema. 118 | * 119 | * @param table table name 120 | * @param name column name 121 | * @return a tuple with table and column name or None if there is no mapping 122 | */ 123 | def metadataMappings(table: String, name: String): Option[(String, String)] = 124 | Option((table, name) match { 125 | case ("commits", "index") => 126 | (RepositoryHasCommitsTable, "index") 127 | case ("commits", "repository_id") => 128 | (RepositoryHasCommitsTable, "repository_id") 129 | case ("commits", "reference_name") => 130 | (RepositoryHasCommitsTable, "reference_name") 131 | case ("tree_entries", "repository_id") => 132 | (RepositoryHasCommitsTable, "repository_id") 133 | case ("tree_entries", "reference_name") => 134 | (RepositoryHasCommitsTable, "reference_name") 135 | case _ => null 136 | }) 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/Sources.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.sql.catalyst.expressions.Expression 4 | import org.apache.spark.sql.types.StructType 5 | import tech.sourced.engine.util.{CompiledFilter, Filter} 6 | 7 | /** 8 | * Defines the hierarchy between data sources. 9 | */ 10 | object Sources { 11 | 12 | val SourceKey: String = "source" 13 | 14 | /** Sources ordered by their position in the hierarchy. */ 15 | val orderedSources = Array( 16 | "repositories", 17 | "references", 18 | "commits", 19 | "tree_entries", 20 | "blobs" 21 | ) 22 | 23 | /** 24 | * Compares two sources. 
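 * For example, compare("repositories", "commits") returns a negative value, since "repositories" appears before "commits" in orderedSources.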
25 | * 26 | * @param a first source 27 | * @param b second source 28 | * @return comparison result 29 | */ 30 | def compare(a: String, b: String): Int = orderedSources.indexOf(a) 31 | .compareTo(orderedSources.indexOf(b)) 32 | 33 | /** 34 | * Returns the list of sources in the schema or the table source if any. 35 | * 36 | * @param tableSource optional source table 37 | * @param schema resultant schema 38 | * @return sequence with table sources 39 | */ 40 | def getSources(tableSource: Option[String], 41 | schema: StructType): Seq[String] = 42 | tableSource match { 43 | case Some(ts) => Seq(ts) 44 | case None => 45 | schema 46 | .map(_.metadata.getString(SourceKey)) 47 | .distinct 48 | .sortWith(Sources.compare(_, _) < 0) 49 | } 50 | 51 | def getFiltersBySource(filters: Seq[Expression]): Map[String, Seq[CompiledFilter]] = 52 | filters.flatMap(Filter.compile) 53 | .map(e => (e.sources.distinct, e)) 54 | .filter(_._1.lengthCompare(1) == 0) 55 | .groupBy(_._1) 56 | .map { case (k, v) => (k.head, v.map(_._2)) } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/TableBuilder.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.sql.{DriverManager, PreparedStatement} 4 | 5 | import org.apache.spark.SparkException 6 | import org.apache.spark.internal.Logging 7 | import org.apache.spark.sql.types._ 8 | 9 | private[engine] object TableBuilder { 10 | def columnSql(field: StructField): String = 11 | s"`${field.name}` ${sqlType(field.dataType)}" + 12 | (if (!field.nullable) s" NOT NULL" else "") 13 | 14 | def pkSql(cols: Seq[String]): Option[String] = if (cols.isEmpty) { 15 | None 16 | } else { 17 | Some(s"PRIMARY KEY (${cols.map(v => s"`$v`").mkString(", ")})") 18 | } 19 | 20 | def indexSql(table: String, col: String): String = 21 | s"CREATE INDEX `${table}_${col}_idx` ON $table(`$col`)" 22 | 23 | def sqlType(dt: DataType): String = dt match { 24 | case IntegerType => "INTEGER" 25 | case LongType | TimestampType => "BIGINT" 26 | case DoubleType => "DOUBLE PRECISION" 27 | case FloatType => "REAL" 28 | case ShortType | ByteType => "TINYINT" 29 | case StringType => "TEXT" 30 | case _ => throw new SparkException(s"there is no SQLite type for datatype $dt") 31 | } 32 | } 33 | 34 | private[engine] case class Table(name: String, 35 | pks: Seq[String], 36 | indexes: Seq[String]) extends Logging { 37 | private def sql(schema: StructType): Seq[String] = { 38 | Seq(s"CREATE TABLE $name (" + 39 | (schema.map(TableBuilder.columnSql) ++ TableBuilder.pkSql(pks)).mkString(",\n") 40 | + s")") ++ 41 | pks.map(TableBuilder.indexSql(name, _)) 42 | } 43 | 44 | def create(dbPath: String, schema: StructType): Unit = { 45 | val conn = DriverManager.getConnection(s"jdbc:sqlite:$dbPath") 46 | conn.setAutoCommit(false) 47 | try { 48 | sql(schema).foreach(sql => { 49 | log.debug(s"executing SQL statement for table `$name`: `$sql`") 50 | var stmt: PreparedStatement = null 51 | try { 52 | stmt = conn.prepareStatement(sql) 53 | stmt.execute() 54 | } finally { 55 | if (stmt != null) { 56 | stmt.close() 57 | } 58 | } 59 | }) 60 | conn.commit() 61 | } catch { 62 | case e: Exception => 63 | log.error(s"unable to create table $name and its indexes", e) 64 | conn.rollback() 65 | } finally { 66 | conn.close() 67 | } 68 | } 69 | } 70 | 71 | object Tables { 72 | 73 | val repositories = Table( 74 | prefix("repositories"), 75 | Seq("id"), 76 | Seq("repository_path") 77 | ) 78 
| 79 | val references = Table( 80 | prefix("references"), 81 | Seq("name", "repository_id"), 82 | Seq() 83 | ) 84 | 85 | val commits = Table( 86 | prefix("commits"), 87 | Seq("hash"), 88 | Seq() 89 | ) 90 | 91 | val repoHasCommits = Table( 92 | prefix("repository_has_commits"), 93 | Seq("hash", "repository_id", "reference_name"), 94 | Seq("index") 95 | ) 96 | 97 | val treeEntries = Table( 98 | prefix("tree_entries"), 99 | // blob id can point to several paths, so we need this overly complex composite pk 100 | Seq("blob", "path", "commit_hash"), 101 | Seq() 102 | ) 103 | 104 | def apply(name: String): Table = name match { 105 | case "repositories" => repositories 106 | case "references" => references 107 | case "commits" => commits 108 | case "repository_has_commits" => repoHasCommits 109 | case "tree_entries" => treeEntries 110 | } 111 | 112 | def prefix(name: String): String = s"engine_$name" 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/compat/compat.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.compat 2 | 3 | import org.apache.spark.SPARK_VERSION 4 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 5 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 6 | import org.apache.spark.sql.execution.datasources.{ 7 | LogicalRelation => SparkLogicalRelation 8 | } 9 | import org.apache.spark.sql.sources.BaseRelation 10 | 11 | import scala.reflect.runtime.{universe => ru} 12 | 13 | private[compat] object Compat { 14 | 15 | def apply[T](s22: T, s23: T): T = SPARK_VERSION match { 16 | case s if s.startsWith("2.2.") => s22 17 | case s if s.startsWith("2.3.") => s23 18 | case _ => 19 | throw new RuntimeException(s"Unsupported SPARK_VERSION: $SPARK_VERSION") 20 | } 21 | 22 | lazy val ClassMirror = ru.runtimeMirror(Compat.getClass.getClassLoader) 23 | 24 | } 25 | 26 | private[engine] object LogicalRelation { 27 | 28 | def apply(rel: BaseRelation, 29 | out: Seq[AttributeReference], 30 | catalog: Option[CatalogTable]): SparkLogicalRelation = 31 | applyImpl(rel, out, catalog) 32 | 33 | private lazy val applyImpl = 34 | Compat(applySpark22(_, _, _), applySpark23(_, _, _)) 35 | 36 | private lazy val typ = ru.typeOf[SparkLogicalRelation] 37 | private lazy val classSymbol = 38 | Compat.ClassMirror.reflectClass(typ.typeSymbol.asClass) 39 | private lazy val ctor = 40 | classSymbol.reflectConstructor(typ.decl(ru.termNames.CONSTRUCTOR).asMethod) 41 | 42 | def applySpark22(rel: BaseRelation, 43 | out: Seq[AttributeReference], 44 | catalog: Option[CatalogTable]): SparkLogicalRelation = 45 | ctor(rel, out, catalog).asInstanceOf[SparkLogicalRelation] 46 | 47 | def applySpark23(rel: BaseRelation, 48 | out: Seq[AttributeReference], 49 | catalog: Option[CatalogTable]): SparkLogicalRelation = 50 | ctor(rel, out, catalog, false).asInstanceOf[SparkLogicalRelation] 51 | 52 | def unapply(arg: SparkLogicalRelation) 53 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = 54 | unapplyImpl(arg) 55 | 56 | private lazy val unapplyImpl = Compat(unapplySpark22(_), unapplySpark23(_)) 57 | 58 | def unapplySpark22(arg: SparkLogicalRelation) 59 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = 60 | Some((arg.relation, arg.output, arg.catalogTable)) 61 | 62 | def unapplySpark23(arg: SparkLogicalRelation) 63 | : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = { 64 | val isStreaming = 
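// Spark 2.3 added an `isStreaming` flag to LogicalRelation; it is read reflectively here so
// a single artifact can support both 2.2 and 2.3, and streaming relations are not matched.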
Compat.ClassMirror 65 | .reflect(arg) 66 | .reflectField(typ.decl(ru.TermName("isStreaming")).asTerm) 67 | .get 68 | .asInstanceOf[Boolean] 69 | if (isStreaming) { 70 | None 71 | } else { 72 | Some((arg.relation, arg.output, arg.catalogTable)) 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/exception/RepositoryException.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.exception 2 | 3 | import org.eclipse.jgit.lib.Repository 4 | 5 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter 6 | 7 | /** 8 | * Exception to add repository debug information to any 9 | * uncontrolled exception. It does not add a stacktrace level. 10 | * 11 | * @param repo Repository that was beeing iterated 12 | * @param cause Original exception 13 | */ 14 | class RepositoryException(repo: Repository, cause: Throwable) 15 | extends Exception( 16 | s"Repository error with data: ${RepositoryException.repoInfo(repo)}", 17 | cause, 18 | true, 19 | false 20 | ) {} 21 | 22 | object RepositoryException { 23 | 24 | def apply(repo: Repository, cause: Throwable): RepositoryException = { 25 | new RepositoryException(repo, cause) 26 | } 27 | 28 | /** 29 | * Returns a string with a debug description of the repository 30 | * @param repo Repository to describe 31 | * @return 32 | */ 33 | def repoInfo(repo: Repository): String = { 34 | val repoPath = try { 35 | repo.toString 36 | } catch { 37 | case _: Throwable => "Unknown repository path" 38 | } 39 | 40 | try { 41 | val c = repo.getConfig 42 | val remotes = c.getSubsections("remote").asScala 43 | val urls = remotes.flatMap(r => c.getStringList("remote", r, "url")) 44 | 45 | if (urls.isEmpty) { 46 | repoPath 47 | } else { 48 | s"$repoPath; urls ${urls.toSet.mkString(", ")}" 49 | } 50 | } catch { 51 | case e: Throwable => 52 | s"Exception in RepositoryException.repoInfo for $repoPath: ${e.getMessage}" 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/BlobIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.eclipse.jgit.diff.RawText 5 | import org.eclipse.jgit.errors.MissingObjectException 6 | import org.eclipse.jgit.lib.{ObjectId, Repository} 7 | import tech.sourced.engine.exception.RepositoryException 8 | import tech.sourced.engine.util.{CompiledFilter, Filters} 9 | 10 | /** 11 | * Iterator that will return rows of blobs in a repository. 
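 * Binary blobs are emitted with empty content, and file content is read up to
 * BlobIterator.readMaxBytes bytes.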
12 | * 13 | * @param finalColumns final columns that must be in the resultant row 14 | * @param repo repository to get the data from 15 | * @param prevIter previous iterator, if the iterator is chained 16 | * @param filters filters for the iterator 17 | */ 18 | class BlobIterator(finalColumns: Array[String], 19 | repo: Repository, 20 | prevIter: TreeEntryIterator, 21 | filters: Seq[CompiledFilter], 22 | skipReadErrors: Boolean) 23 | extends ChainableIterator[Blob]( 24 | finalColumns, 25 | Option(prevIter).orNull, 26 | filters, 27 | repo, 28 | skipReadErrors 29 | ) with Logging { 30 | 31 | /** @inheritdoc*/ 32 | override protected def loadIterator(compiledFilters: Seq[CompiledFilter]): Iterator[Blob] = { 33 | val filters = Filters(compiledFilters) 34 | val treeEntryIter = Option(prevIter) match { 35 | case Some(it) => 36 | Seq(it.currentRow).toIterator 37 | case None => GitTreeEntryIterator.loadIterator( 38 | repo, 39 | None, 40 | filters, 41 | blobIdKey = "blob_id" 42 | ) 43 | } 44 | 45 | val iter = treeEntryIter.flatMap(entry => { 46 | if (repo.hasObject(entry.blob)) { 47 | Some( 48 | Blob( 49 | entry.blob, 50 | entry.commitHash, 51 | entry.ref, 52 | entry.repo, 53 | BlobIterator.readFile( 54 | entry.blob, 55 | repo 56 | ) 57 | )) 58 | } else { 59 | None 60 | } 61 | }) 62 | 63 | if (filters.hasFilters("blob_id")) { 64 | iter.filter(b => filters.matches(Seq("blob_id"), b.id.getName)) 65 | } else { 66 | iter 67 | } 68 | } 69 | 70 | override protected def mapColumns(blob: Blob): RawRow = { 71 | val isBinary = RawText.isBinary(blob.content) 72 | 73 | Map[String, Any]( 74 | "commit_hash" -> blob.commit.getName, 75 | "repository_id" -> blob.repo, 76 | "reference_name" -> blob.ref, 77 | "blob_id" -> blob.id.getName, 78 | "content" -> (if (isBinary) Array.emptyByteArray else blob.content), 79 | "is_binary" -> isBinary 80 | ) 81 | } 82 | 83 | } 84 | 85 | case class Blob(id: ObjectId, 86 | commit: ObjectId, 87 | ref: String, 88 | repo: String, 89 | content: Array[Byte]) 90 | 91 | object BlobIterator extends Logging { 92 | /** Max bytes to read for the content of a file. 
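 * Currently 20 MiB; readFile will not load more than this for large objects.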
*/ 93 | val readMaxBytes: Int = 20 * 1024 * 1024 94 | 95 | /** 96 | * Read max N bytes of the given blob 97 | * 98 | * @param objId ID of the object to read 99 | * @param repo repository to get the data from 100 | * @param max maximum number of bytes to read in memory 101 | * @return Bytearray with the contents of the file 102 | */ 103 | def readFile(objId: ObjectId, repo: Repository, max: Integer = readMaxBytes): Array[Byte] = { 104 | val reader = repo.newObjectReader() 105 | val obj = try { 106 | reader.open(objId) 107 | } catch { 108 | case e: MissingObjectException => 109 | log.warn(s"missing object", new RepositoryException(repo, e)) 110 | null 111 | } 112 | 113 | if (obj != null) { 114 | val data = if (obj.isLarge) { 115 | val buf = Array.ofDim[Byte](max) 116 | val is = obj.openStream() 117 | is.read(buf) 118 | is.close() 119 | buf 120 | } else { 121 | obj.getBytes 122 | } 123 | reader.close() 124 | data 125 | } else { 126 | Array.emptyByteArray 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/ChainableIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.Row 5 | import org.eclipse.jgit.api.errors.GitAPIException 6 | import org.eclipse.jgit.errors.{ 7 | IncorrectObjectTypeException, 8 | MissingObjectException, 9 | RevWalkException 10 | } 11 | import org.eclipse.jgit.lib.Repository 12 | import tech.sourced.engine.exception.RepositoryException 13 | import tech.sourced.engine.util.CompiledFilter 14 | 15 | import scala.annotation.tailrec 16 | 17 | /** 18 | * Iterator that can have a previous iterator to output chained values. 19 | * 20 | * @param finalColumns final columns that must be in the resultant row 21 | * @param prevIter previous iterator, if the iterator is chained 22 | * @param filters filters for the iterator 23 | * @tparam T type of data returned by the internal iterator 24 | */ 25 | abstract class ChainableIterator[T](finalColumns: Array[String], 26 | prevIter: ChainableIterator[_], 27 | filters: Seq[CompiledFilter], 28 | val repo: Repository, 29 | skipReadErrors: Boolean 30 | ) extends Iterator[Row] with Logging { 31 | 32 | /** Raw values of the row. */ 33 | type RawRow = Map[String, Any] 34 | 35 | /** Instance of the internal iterator. */ 36 | private var iter: Iterator[T] = _ 37 | 38 | /** The current row of the prevIter, null always if there is no prevIter. */ 39 | private var prevIterCurrentRow: RawRow = _ 40 | 41 | /** The current row of the internal iterator. */ 42 | private[iterator] var currentRow: T = _ 43 | 44 | /** 45 | * Returns the internal iterator that will return the data used to construct the final row. 46 | * 47 | * @param filters filters for the iterator 48 | * @return internal iterator 49 | */ 50 | protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[T] 51 | 52 | /** 53 | * Loads the next internal iterator. 54 | * 55 | * @return internal iterator 56 | */ 57 | private def loadIterator: Iterator[T] = loadIterator(filters) 58 | 59 | /** 60 | * Given the object returned by the internal iterator, this method must transform 61 | * that object into a RawRow. 
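 * The keys of the returned map are column names; the map is merged with the previous
 * iterator's row (if any) and then projected onto finalColumns.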
62 | * 63 | * @param obj object returned by the internal iterator 64 | * @return raw row 65 | */ 66 | protected def mapColumns(obj: T): RawRow 67 | 68 | @tailrec 69 | final override def hasNext: Boolean = { 70 | loadNext match { 71 | case Some(v) => v 72 | case None => hasNext 73 | } 74 | } 75 | 76 | /** 77 | * Load the next iterator and returns if there is a next item or not. If 78 | * it returns some value it means we know for sure there is something or 79 | * not. If it returns None, it means we don't know and another call to 80 | * loadNext is required. 81 | * 82 | * @return whether there is a next item in the iterator or not, or if we 83 | * don't know 84 | */ 85 | final def loadNext: Option[Boolean] = { 86 | try { 87 | // If there is no previous iter just load the iterator the first pass 88 | // and use hasNext of iter all the times. We return here to get rid of 89 | // this logic and assume from this point on that prevIter is not null 90 | if (prevIter == null) { 91 | if (iter == null) { 92 | iter = loadIterator 93 | } 94 | 95 | return Some(iter.hasNext) 96 | } 97 | 98 | // If the iter is not loaded, do so, but only if there are actually more 99 | // rows in the prev iter. If there are, just load the iter and preload 100 | // the prevIterCurrentRow. 101 | if (iter == null) { 102 | if (prevIter.isEmpty) { 103 | return Some(false) 104 | } 105 | 106 | prevIterCurrentRow = prevIter.nextRaw 107 | iter = loadIterator 108 | } 109 | 110 | // if iter is empty, we need to check if there are more rows in the prev iter 111 | // if not, just finish. If there are, preload the next raw row of the prev iter 112 | // and load the iterator again for the prev iter current row 113 | if (iter.hasNext) { 114 | Some(true) 115 | } else { 116 | if (prevIter.isEmpty) { 117 | return Some(false) 118 | } 119 | 120 | prevIterCurrentRow = prevIter.nextRaw 121 | iter = loadIterator 122 | 123 | None 124 | } 125 | } catch { 126 | case e: IncorrectObjectTypeException => 127 | log.debug("incorrect object type", new RepositoryException(repo, e)) 128 | None 129 | case e: MissingObjectException => 130 | log.warn("missing object", new RepositoryException(repo, e)) 131 | None 132 | case e: RevWalkException => 133 | log.warn("rev walk exception", new RepositoryException(repo, e)) 134 | None 135 | case e: GitAPIException => 136 | log.warn("git api exception", new RepositoryException(repo, e)) 137 | None 138 | case e@(_: Exception | _: RuntimeException) => 139 | if (skipReadErrors) { 140 | log.warn("read error skipped", new RepositoryException(repo, e)) 141 | None 142 | } else { 143 | throw new RepositoryException(repo, e) 144 | } 145 | case e: Throwable => 146 | throw e 147 | } 148 | } 149 | 150 | override def next: Row = { 151 | currentRow = iter.next 152 | // FIXME: if there's a repeated column name, value 153 | // will be the last one added. This could be solved by 154 | // qualifying all column names with their source. 
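// Merge the previous iterator's raw row (if any) with this iterator's columns; on duplicate
// keys the value from the current iterator wins, then the result is projected onto
// finalColumns in order.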
155 | val mappedValues = if (prevIterCurrentRow != null) { 156 | prevIterCurrentRow ++ mapColumns(currentRow) 157 | } else { 158 | mapColumns(currentRow) 159 | } 160 | 161 | val values = finalColumns.map(c => mappedValues(c)) 162 | Row(values: _*) 163 | } 164 | 165 | 166 | def nextRaw: RawRow = { 167 | currentRow = iter.next 168 | val row = mapColumns(currentRow) 169 | if (prevIterCurrentRow != null) { 170 | prevIterCurrentRow ++ row 171 | } else { 172 | row 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/CleanupIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.{InterruptibleIterator, TaskContext} 4 | import tech.sourced.engine.exception.RepositoryException 5 | 6 | /** 7 | * Iterator that calls a cleanup function after the given iterator has 8 | * finished or an exception has been thrown. 9 | * 10 | * @param it internal iterator 11 | * @param cleanup cleanup function 12 | * @tparam T type of the rows in the iterator 13 | */ 14 | class CleanupIterator[T](it: Iterator[T], cleanup: => Unit) 15 | extends InterruptibleIterator[T](TaskContext.get(), it) { 16 | 17 | /** @inheritdoc 18 | * 19 | * After catching an exception cleans up all the resources calling the cleanup function 20 | * and will rethrow such exception again. 21 | */ 22 | override def hasNext: Boolean = { 23 | try { 24 | val hasNext = super.hasNext 25 | if (!hasNext) { 26 | val _ = cleanup 27 | } 28 | hasNext 29 | } catch { 30 | case e: Throwable => 31 | val _ = cleanup 32 | throw e 33 | } 34 | } 35 | 36 | /** @inheritdoc*/ 37 | override def next(): T = super.next() 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/MetadataIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.sql.{Connection, DriverManager, ResultSet} 4 | 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.sql.catalyst.expressions.Attribute 7 | import org.apache.spark.sql.types.{ArrayType, BooleanType, TimestampType} 8 | 9 | class MetadataIterator(fields: Seq[Attribute], 10 | dbPath: String, 11 | sql: String) 12 | extends Iterator[Map[String, Any]] with Logging { 13 | 14 | private val iter = new JDBCQueryIterator(fields, dbPath, sql) 15 | 16 | override def hasNext: Boolean = iter.hasNext 17 | 18 | override def next(): Map[String, Any] = { 19 | val values = iter.next() 20 | Map[String, Any](fields.zipWithIndex.map { 21 | case (attr, idx) if attr.dataType == BooleanType => 22 | (attr.name, values(idx) match { 23 | case 0 => false 24 | case 1 => true 25 | case _ => null 26 | }) 27 | case (attr, idx) if attr.dataType.isInstanceOf[ArrayType] => 28 | (attr.name, values(idx).toString.split("\\|")) 29 | case (attr, idx) if attr.dataType == TimestampType => 30 | (attr.name, new java.sql.Timestamp(values(idx).asInstanceOf[Long])) 31 | case (attr, idx) => 32 | (attr.name, values(idx)) 33 | }: _*) 34 | } 35 | 36 | def close(): Unit = iter.close() 37 | 38 | } 39 | 40 | class JDBCQueryIterator(fields: Seq[Attribute], 41 | dbPath: String, 42 | sql: String) 43 | extends Iterator[Array[Any]] with Logging { 44 | 45 | private var rs: ResultSet = _ 46 | private var conn: Connection = _ 47 | private var nextCollected = false 48 | private var hasRows = 
false 49 | 50 | private[iterator] def close(): Unit = { 51 | try { 52 | if (rs != null && !rs.isClosed) { 53 | rs.close() 54 | } 55 | } finally { 56 | if (conn != null && !conn.isClosed) { 57 | try { 58 | conn.close() 59 | } catch { 60 | case e: Exception => log.warn(s"could not close connection", e) 61 | } 62 | } 63 | } 64 | } 65 | 66 | override def hasNext: Boolean = { 67 | if (rs == null) { 68 | initializeResultSet() 69 | } else if (hasRows && !nextCollected) { 70 | // FIXME: RDD groupBy somehow calls hasNext twice, so we can't 71 | // advance the cursor until the next row has been collected to make sure 72 | // we don't skip rows. 73 | return true 74 | } 75 | 76 | try { 77 | if (!rs.isClosed && rs.next) { 78 | hasRows = true 79 | nextCollected = false 80 | true 81 | } else { 82 | close() 83 | false 84 | } 85 | } catch { 86 | case e: Exception => 87 | log.warn(s"caught an exception in JDBCIterator.hasNext", e) 88 | close() 89 | false 90 | } 91 | } 92 | 93 | private def initializeResultSet(): Unit = { 94 | conn = DriverManager.getConnection(s"jdbc:sqlite:$dbPath") 95 | val stmt = conn.prepareStatement(sql) 96 | try { 97 | rs = stmt.executeQuery() 98 | } catch { 99 | case e: Exception => 100 | log.warn(s"could not execute query", e) 101 | close() 102 | } 103 | } 104 | 105 | override def next(): Array[Any] = { 106 | nextCollected = true 107 | fields.zipWithIndex 108 | .map(f => rs.getObject(f._2 + 1)) 109 | .toArray 110 | .asInstanceOf[Array[Any]] 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/ReferenceIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.{ObjectId, Ref, Repository} 4 | import tech.sourced.engine.util.{CompiledFilter, Filters} 5 | 6 | import scala.collection.JavaConverters._ 7 | 8 | /** 9 | * Iterator that will return rows of references in a repository. 10 | * 11 | * @param finalColumns final columns that must be in the resultant row 12 | * @param repo repository to get the data from 13 | * @param prevIter previous iterator, if the iterator is chained 14 | * @param filters filters for the iterator 15 | */ 16 | class ReferenceIterator(finalColumns: Array[String], 17 | repo: Repository, 18 | prevIter: RepositoryIterator, 19 | filters: Seq[CompiledFilter], 20 | skipReadErrors: Boolean) 21 | extends ChainableIterator[Ref]( 22 | finalColumns, 23 | prevIter, 24 | filters, 25 | repo, 26 | skipReadErrors 27 | ) { 28 | 29 | /** @inheritdoc */ 30 | protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[Ref] = 31 | ReferenceIterator.loadIterator( 32 | repo, 33 | Option(prevIter).map(_.currentRow), 34 | Filters(filters) 35 | ) 36 | 37 | /** @inheritdoc */ 38 | override protected def mapColumns(ref: Ref): RawRow = { 39 | val (repoId, refName) = RootedRepo.parseRef(repo, ref.getName) 40 | Map[String, Any]( 41 | "repository_id" -> repoId, 42 | "name" -> refName, 43 | "hash" -> ObjectId.toString(Option(ref.getPeeledObjectId).getOrElse(ref.getObjectId)), 44 | "is_remote" -> RootedRepo.isRemote(repo, ref.getName) 45 | ) 46 | } 47 | } 48 | 49 | object ReferenceIterator { 50 | 51 | /** 52 | * Returns an iterator of references. 53 | * 54 | * @param repo repository to get the data from 55 | * @param filters filters to skip some rows. The only supported filters at the iterator 56 | * level are by repository id and by reference name. 
The keys of said filters 57 | * are controlled by the parameters `repoKey` and `refNameKey`. 58 | * @param repoKey name of the repository id filter key 59 | * @param refNameKey name of the reference name filter key 60 | * @return the iterator 61 | */ 62 | def loadIterator(repo: Repository, 63 | repoId: Option[String], 64 | filters: Filters, 65 | repoKey: String = "repository_id", 66 | refNameKey: String = "name"): Iterator[Ref] = { 67 | val repoKeys = Seq(repoKey) 68 | val repoIds: Array[String] = repoId match { 69 | case Some(id) => 70 | if (!filters.hasFilters(repoKeys: _*) || filters.matches(repoKeys, id)) { 71 | Array(id) 72 | } else { 73 | Array() 74 | } 75 | case None => 76 | RepositoryIterator.loadIterator(repo, filters, repoKey).toArray 77 | } 78 | 79 | val refNameKeys = Seq("name", refNameKey) 80 | val hasRefFilters = filters.hasFilters(refNameKeys: _*) 81 | val out = repo.getAllRefs.asScala.values.filter(ref => { 82 | val (repoId, refName) = RootedRepo.parseRef(repo, ref.getName) 83 | (repoIds.isEmpty || repoIds.contains(repoId)) && 84 | (!hasRefFilters || filters.matches(refNameKeys, refName)) 85 | }) 86 | 87 | out.toIterator 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/RepositoryIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.Repository 4 | import tech.sourced.engine.util.{CompiledFilter, Filters} 5 | 6 | /** 7 | * Iterator that will return rows of repositories in a repository. 8 | * 9 | * @param repositoryPath path of the given repository 10 | * @param finalColumns final columns that must be in the resultant row 11 | * @param repo repository to get the data from 12 | * @param filters filters for the iterator 13 | */ 14 | class RepositoryIterator(repositoryPath: String, 15 | finalColumns: Array[String], 16 | repo: Repository, 17 | filters: Seq[CompiledFilter], 18 | skipReadErrors: Boolean) 19 | extends ChainableIterator[String]( 20 | finalColumns, 21 | null, 22 | filters, 23 | repo, 24 | skipReadErrors 25 | ) { 26 | 27 | // since this iterator does not override getFilters method of RootedRepository 28 | // we can cache here the matching cases, because they are not going to change. 29 | private val matchingFilters = Filters(filters) 30 | 31 | /** @inheritdoc*/ 32 | override protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[String] = 33 | RepositoryIterator.loadIterator(repo, matchingFilters) 34 | 35 | /** @inheritdoc*/ 36 | override protected def mapColumns(id: String): RawRow = { 37 | val c = repo.getConfig 38 | val remote = RootedRepo.getRepositoryRemote(repo, id) 39 | val urls = remote.map(r => c.getStringList("remote", r, "url")) 40 | .orElse(Some(Array[String]())).get 41 | val isFork = remote.map(r => c.getBoolean("remote", r, "isfork", false)) 42 | .orElse(Some(false)).get 43 | 44 | Map[String, Any]( 45 | "id" -> id, 46 | "urls" -> urls, 47 | "is_fork" -> isFork, 48 | "repository_path" -> repositoryPath 49 | ) 50 | } 51 | } 52 | 53 | object RepositoryIterator { 54 | 55 | import scala.collection.JavaConverters._ 56 | 57 | /** 58 | * Returns an iterator of references. 59 | * 60 | * @param repo repository to get the data from 61 | * @param filters filters to skip some rows. The only supported filters at the iterator 62 | * level are by repository id. The key of said filters 63 | * are controlled by the parameter `repoKey`. 
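 * If no filter applies to the id key, every repository id found in the remotes and
 * local refs of the rooted repository is returned.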
64 | * @param repoKey name of the repository id filter key 65 | * @return the iterator 66 | */ 67 | def loadIterator(repo: Repository, 68 | filters: Filters, 69 | repoKey: String = "id"): Iterator[String] = { 70 | // If there's any non-remote reference, it will show up here, thus 71 | // making the local repository appear. If we only take into account 72 | // the remotes the result will be different from the one returned by 73 | // the reference iterator. 74 | // This makes us process this twice in a chained reference iterator 75 | // scenario, even though the result would be correct without this, 76 | // but it's needed for correctness when the table is asked independently. 77 | val refRepos = repo.getAllRefs.asScala.keys 78 | .map(ref => RootedRepo.parseRef(repo, ref)._1) 79 | 80 | val repos = repo.getConfig.getSubsections("remote").asScala.toIterator 81 | .map(RootedRepo.getRepositoryId(repo, _).get) ++ refRepos 82 | 83 | val iter = repos.toList.distinct.toIterator 84 | 85 | val filterKeys = Seq("id", repoKey) 86 | if (filters.hasFilters(filterKeys: _*)) { 87 | iter.filter(filters.matches(filterKeys, _)) 88 | } else { 89 | iter 90 | } 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/iterator/RootedRepo.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.eclipse.jgit.lib.{Repository, StoredConfig} 4 | import tech.sourced.engine.util.GitUrlsParser 5 | import scala.collection.JavaConverters.collectionAsScalaIterableConverter 6 | 7 | object RootedRepo { 8 | 9 | /** 10 | * Returns the ID of a repository given its remote name. 11 | * 12 | * @param repo repository 13 | * @param remoteName remote name 14 | * @return repository ID 15 | */ 16 | private[iterator] def getRepositoryId(repo: Repository, remoteName: String): Option[String] = { 17 | // TODO: maybe a cache here could improve performance 18 | val c: StoredConfig = repo.getConfig 19 | c.getSubsections("remote").asScala.find(_ == remoteName) match { 20 | case None => None 21 | case Some(name) => Some(GitUrlsParser.getIdFromUrls( 22 | c.getStringList("remote", name, "url") 23 | )) 24 | } 25 | } 26 | 27 | /** 28 | * Returns the remote name of a repository with the given ID. 29 | * 30 | * @param repo repository 31 | * @param id repository id 32 | * @return remote name 33 | */ 34 | private[iterator] def getRepositoryRemote(repo: Repository, id: String): Option[String] = { 35 | // TODO: maybe a cache here could improve performance 36 | val c: StoredConfig = repo.getConfig 37 | c.getSubsections("remote").asScala.find(remoteName => { 38 | val actualId: String = 39 | GitUrlsParser.getIdFromUrls(c.getStringList("remote", remoteName, "url")) 40 | 41 | actualId == id 42 | }) 43 | } 44 | 45 | /** 46 | * Parses a reference name and returns a tuple with the repository id and the reference name. 
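 * For refs coming from siva (rooted) repositories the last path segment is the UUID of the
 * remote the ref belongs to, so e.g. "refs/heads/master/<remote-uuid>" is parsed into the
 * repository id derived from that remote's urls and "refs/heads/master".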
47 | * 48 | * @param repo repository 49 | * @param ref reference name 50 | * @return tuple with repository id and reference name 51 | */ 52 | private[iterator] def parseRef(repo: Repository, ref: String): (String, String) = { 53 | val split: Array[String] = ref.split("/") 54 | val uuid: String = split.last 55 | 56 | // if it's a siva file, the last part will be the uuid of the repository, which 57 | // is the name of the remote associated to that particular repository 58 | getRepositoryId(repo, uuid) match { 59 | case Some(repoId) => 60 | val refName: String = split.init.mkString("/") 61 | 62 | (repoId, refName) 63 | 64 | // If no uuid matches, it means this is not a siva file, so we should find this 65 | // using the whole reference name 66 | case None => 67 | val c: StoredConfig = repo.getConfig 68 | val refRemote = repo.getRemoteName(ref) 69 | val repoId = c.getSubsections("remote").asScala 70 | .find(_ == refRemote) 71 | .map(r => GitUrlsParser.getIdFromUrls(c.getStringList("remote", r, "url"))) 72 | .orNull 73 | 74 | if (repoId == null) { 75 | // if branch is local, use the repo path as directory 76 | // since there's no way to tell to which remote it belongs (probably none) 77 | val repoPath = if (repo.getDirectory.toPath.getFileName.toString == ".git") { 78 | // non-bare repositories will have the .git directory as their directory 79 | // so we'll use the parent 80 | repo.getDirectory.toPath.getParent 81 | } else { 82 | repo.getDirectory.toPath 83 | } 84 | 85 | ("file://" + repoPath, ref) 86 | } else { 87 | (repoId, ref.replace(s"refs/remotes/$refRemote", "refs/heads")) 88 | } 89 | } 90 | } 91 | 92 | private[iterator] def isRemote(repo: Repository, ref: String): Boolean = { 93 | val split: Array[String] = ref.split("/") 94 | val uuid: String = split.last 95 | 96 | // if it's a siva file, the last part will be the uuid of the repository, which 97 | // is the name of the remote associated to that particular repository 98 | getRepositoryId(repo, uuid) match { 99 | case Some(_) => 100 | true // is a siva file 101 | 102 | // If no uuid matches, it means this is not a siva file, so we should find this 103 | // using the whole reference name 104 | case None => 105 | Option(repo.getRemoteName(ref)).isDefined 106 | } 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/provider/ReadOnlyFileRepository.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.io.File 4 | 5 | import org.eclipse.jgit.internal.storage.file.FileRepository 6 | import org.eclipse.jgit.storage.file.FileBasedConfig 7 | 8 | /** 9 | * [[FileRepository]] implementation for read-only repositories. 10 | * 11 | * Some operations are performance optimized for this case. If the underlying repository changes, 12 | * usage of this repository implementation might lead to unexpected results. 13 | * 14 | * @param gitDir Path to the git directory. 15 | */ 16 | private[provider] class ReadOnlyFileRepository(gitDir: File) extends FileRepository(gitDir) { 17 | 18 | /** @inheritdoc */ 19 | override lazy val getConfig: FileBasedConfig = { 20 | //XXX: repoConfig is initialized in FileRepository's constructor. 21 | // Here we always return it without checking for changes in the underlying 22 | // filesystem. This prevents checking for last modification date of configuration 23 | // files on every operation. 
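// The field is private in FileRepository, hence the reflective access below; caching it is
// only safe because this repository is treated as read-only.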
24 | val accessor = classOf[FileRepository].getDeclaredField("repoConfig") 25 | accessor.setAccessible(true) 26 | accessor.get(this).asInstanceOf[FileBasedConfig] 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/provider/RepositoryRDDProvider.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.util.concurrent.ConcurrentHashMap 4 | 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.input.PortableDataStream 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.collection.concurrent 11 | import scala.collection.convert.decorateAsScala._ 12 | 13 | /** 14 | * Provides an RDD of repositories in the following forms: 15 | * - siva files 16 | * - bare repositories 17 | * - regular git repositories 18 | * 19 | * @param sc Spark Context 20 | */ 21 | class RepositoryRDDProvider(sc: SparkContext) { 22 | private val rdd: concurrent.Map[String, RDD[RepositorySource]] = 23 | new ConcurrentHashMap[String, RDD[RepositorySource]]().asScala 24 | 25 | /** 26 | * Generates an RDD of repositories with their source at the given path. 27 | * Path may be remote or local. 28 | * 29 | * @param path Path where the repositories are stored. 30 | * @param repositoriesFormat Format of the repositories that are inside the provided path 31 | * @return RDD of repositories 32 | */ 33 | def get(path: String, repositoriesFormat: String): RDD[RepositorySource] = 34 | rdd.getOrElse(path, RepositoryRDDProvider.generateRDD(sc, path, repositoriesFormat)) 35 | } 36 | 37 | /** 38 | * Provides some utility methods for [[RepositoryRDDProvider]] class. 39 | * Acts as a singleton for getting an unique instance of [[RepositoryRDDProvider]]s, so the 40 | * recommended way of using said class is using this companion object. 41 | */ 42 | object RepositoryRDDProvider { 43 | val SivaFormat: String = "siva" 44 | val BareFormat: String = "bare" 45 | val StandardFormat: String = "standard" 46 | 47 | /** The singleton Siva RDD provider. */ 48 | var provider: RepositoryRDDProvider = _ 49 | 50 | /** 51 | * Returns the provider instance and creates one if none has been created yet. 52 | * 53 | * @param sc Spark Context 54 | * @return RepositorySource RDD provider 55 | */ 56 | def apply(sc: SparkContext): RepositoryRDDProvider = { 57 | Option(provider).getOrElse({ 58 | provider = new RepositoryRDDProvider(sc) 59 | provider 60 | }) 61 | } 62 | 63 | /** 64 | * Generates an RDD of [[RepositorySource]] with the repositories at the given path. 65 | * Allows bucketing of siva files and raw repositories. 
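 * Supported formats are "siva", "bare" and "standard". An illustrative call through the
 * public API, given a SparkContext sc:
 * {{{
 *   RepositoryRDDProvider(sc).get("/path/to/siva-files", RepositoryRDDProvider.SivaFormat)
 * }}}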
66 | * 67 | * @param sc Spark Context 68 | * @param path path to get the repositories from 69 | * @param repositoriesFormat format of the repositories inside the provided path 70 | * @return generated RDD 71 | */ 72 | private def generateRDD(sc: SparkContext, 73 | path: String, 74 | repositoriesFormat: String): RDD[RepositorySource] = { 75 | repositoriesFormat match { 76 | case SivaFormat => 77 | sc.binaryFiles(s"$path/*").flatMap(b => if (b._1.endsWith(".siva")) { 78 | Some(SivaRepository(b._2)) 79 | } else { 80 | None 81 | }) 82 | case StandardFormat | BareFormat => 83 | sc.binaryFiles(s"$path/**/*").map { 84 | case (path: String, pds: PortableDataStream) => 85 | // returns a tuple of the root directory where it is contained, with a maximum depth 86 | // of 1 under the given path, the file name, and the portable data stream 87 | val idx = path.indexOf('/', path.length + 1) 88 | if (idx < 0) { 89 | val p = new Path(path) 90 | (p.getParent.toString, (p.getName, pds)) 91 | } else { 92 | val (parent, file) = path.splitAt(idx) 93 | (parent, (file, pds)) 94 | } 95 | }.groupByKey() 96 | .map { 97 | case (dir, files) => 98 | if (repositoriesFormat == StandardFormat) { 99 | GitRepository(dir, files.head._2) 100 | } else { 101 | BareRepository(dir, files.head._2) 102 | } 103 | } 104 | case other => throw new RuntimeException(s"Repository format $other is not supported") 105 | } 106 | } 107 | 108 | } 109 | 110 | /** 111 | * RepositorySource is a repository that comes from a certain source. 112 | */ 113 | sealed trait RepositorySource extends Serializable { 114 | /** 115 | * Returns the portable data stream of one of the repository files. In the case 116 | * of siva files, of the siva file itself. 117 | * 118 | * @return portable data stream 119 | */ 120 | def pds: PortableDataStream 121 | 122 | /** 123 | * Returns the path to the root of the repository. In the case of siva files, the 124 | * path to the siva file itself. 125 | * 126 | * @return path to the repository root 127 | */ 128 | def root: String 129 | } 130 | 131 | /** 132 | * Repository coming from a siva file. 133 | * 134 | * @param pds portable data stream of the siva file 135 | */ 136 | case class SivaRepository(pds: PortableDataStream) extends RepositorySource { 137 | def root: String = pds.getPath 138 | } 139 | 140 | /** 141 | * Repository coming from a bare repository. 142 | * 143 | * @param root root of the repository 144 | * @param pds portable data stream of any repository file (should only be used to 145 | * retrieve the HDFS config) 146 | */ 147 | case class BareRepository(root: String, pds: PortableDataStream) extends RepositorySource 148 | 149 | /** 150 | * Repository coming from a regular repository with a .git directory. 
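 * (a non-bare checkout, in contrast to BareRepository above)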
151 | * 152 | * @param root root of the repository 153 | * @param pds portable data stream of any repository file (should only be used to 154 | * retrieve the HDFS config) 155 | */ 156 | case class GitRepository(root: String, pds: PortableDataStream) extends RepositorySource 157 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/rule/AddSourceToAttributes.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.rule 2 | 3 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 4 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 5 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 6 | import org.apache.spark.sql.catalyst.rules.Rule 7 | import org.apache.spark.sql.execution.datasources.LogicalRelation 8 | import org.apache.spark.sql.sources.BaseRelation 9 | import org.apache.spark.sql.types.MetadataBuilder 10 | import tech.sourced.engine.{GitRelation, MetadataRelation, Sources} 11 | import tech.sourced.engine.compat 12 | 13 | /** 14 | * Rule to assign to an [[AttributeReference]] metadata to identify the table it belongs to. 15 | */ 16 | object AddSourceToAttributes extends Rule[LogicalPlan] { 17 | 18 | /** 19 | * SOURCE is the key used for attach metadata to [[AttributeReference]]s. 20 | */ 21 | private val SOURCE = Sources.SourceKey 22 | 23 | /** @inheritdoc */ 24 | def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { 25 | case compat.LogicalRelation(rel @ GitRelation(_, _, _, schemaSource), 26 | out, 27 | catalogTable) => 28 | withMetadata(rel, schemaSource, out, catalogTable) 29 | 30 | case compat.LogicalRelation( 31 | rel @ MetadataRelation(_, _, _, _, schemaSource), 32 | out, 33 | catalogTable) => 34 | withMetadata(rel, schemaSource, out, catalogTable) 35 | } 36 | 37 | private def withMetadata(relation: BaseRelation, 38 | schemaSource: Option[String], 39 | out: Seq[AttributeReference], 40 | catalogTable: Option[CatalogTable]): LogicalRelation = { 41 | val processedOut = schemaSource match { 42 | case Some(table) => out.map( 43 | _.withMetadata(new MetadataBuilder().putString(SOURCE, table).build() 44 | ).asInstanceOf[AttributeReference] 45 | ) 46 | case None => out 47 | } 48 | 49 | compat.LogicalRelation(relation, processedOut, catalogTable) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/rule/RelationOptimizer.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.rule 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.catalyst.expressions._ 5 | import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} 6 | import org.apache.spark.sql.catalyst.plans.logical.Join 7 | import org.apache.spark.sql.execution.datasources.LogicalRelation 8 | import org.apache.spark.sql.types.{StructField, StructType} 9 | import tech.sourced.engine.Sources 10 | 11 | private[rule] object RelationOptimizer extends Logging { 12 | private val supportedJoinTypes: Seq[JoinType] = Inner :: Nil 13 | 14 | /** 15 | * Reports whether the given join is supported. 16 | * 17 | * @param j join 18 | * @return is supported or not 19 | */ 20 | def isJoinSupported(j: Join): Boolean = supportedJoinTypes.contains(j.joinType) 21 | 22 | /** 23 | * Retrieves all the unsupported conditions in the join. 
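 * That is, the references used by the join condition that are not provided by either the
 * left or the right relation.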
24 | * 25 | * @param join Join 26 | * @param left left relation 27 | * @param right right relation 28 | * @return unsupported conditions 29 | */ 30 | def getUnsupportedConditions(join: Join, 31 | left: LogicalRelation, 32 | right: LogicalRelation): Set[_] = { 33 | val leftReferences = left.references.baseSet 34 | val rightReferences = right.references.baseSet 35 | val joinReferences = join.references.baseSet 36 | joinReferences -- leftReferences -- rightReferences 37 | } 38 | 39 | /** 40 | * Mixes the two given expressions with the given join function if both exist 41 | * or returns the one that exists otherwise. 42 | * 43 | * @param l left expression 44 | * @param r right expression 45 | * @param joinFunction function used to join them 46 | * @return an optional expression 47 | */ 48 | def mixExpressions(l: Option[Expression], 49 | r: Option[Expression], 50 | joinFunction: (Expression, Expression) => Expression): 51 | Option[Expression] = { 52 | (l, r) match { 53 | case (Some(expr1), Some(expr2)) => Some(joinFunction(expr1, expr2)) 54 | case (None, None) => None 55 | case (le, None) => le 56 | case (None, re) => re 57 | } 58 | } 59 | 60 | /** 61 | * Creates a schema from a list of attributes. 62 | * 63 | * @param attributes list of attributes 64 | * @return resultant schema 65 | */ 66 | def attributesToSchema(attributes: Seq[AttributeReference]): StructType = 67 | StructType( 68 | attributes 69 | .map((a: Attribute) => StructField(a.name, a.dataType, a.nullable, a.metadata)) 70 | .toArray 71 | ) 72 | 73 | /** 74 | * Takes the join conditions, if any, and transforms them to filters, by removing some filters 75 | * that don't make sense because they are already done inside the iterator. 76 | * 77 | * @param expr optional condition to transform 78 | * @return transformed join conditions or none 79 | */ 80 | def joinConditionsToFilters(expr: Option[Expression]): Option[Expression] = expr match { 81 | case Some(e) => 82 | e transformUp { 83 | case Equality( 84 | a: AttributeReference, 85 | b: AttributeReference 86 | ) if isRedundantAttributeFilter(a, b) => 87 | EqualTo(Literal(1), Literal(1)) 88 | 89 | case BinaryOperator(a, Equality(IntegerLiteral(1), IntegerLiteral(1))) => 90 | a 91 | 92 | case BinaryOperator(Equality(IntegerLiteral(1), IntegerLiteral(1)), b) => 93 | b 94 | } match { 95 | case Equality(IntegerLiteral(1), IntegerLiteral(1)) => 96 | None 97 | case finalExpr => 98 | Some(finalExpr) 99 | } 100 | case None => None 101 | } 102 | 103 | /** 104 | * Returns whether the equality between the two given attribute references is redundant 105 | * for a filter (because they are taken care of inside the iterators). 
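 * For example, an equality between repositories.id and references.repository_id is
 * redundant, since the chained iterators already enforce that relationship.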
106 | * 107 | * @param a left attribute 108 | * @param b right attribute 109 | * @return is redundant or not 110 | */ 111 | def isRedundantAttributeFilter(a: AttributeReference, b: AttributeReference): Boolean = { 112 | // to avoid case (a, b) and case (b, a) we take left and right sorted by name and source 113 | val (left, right) = a.name.compareTo(b.name) match { 114 | case 0 => 115 | val sourceA = attributeSource(a).getOrElse("") 116 | val sourceB = attributeSource(b).getOrElse("") 117 | if (sourceA.compareTo(sourceB) <= 0) (a, b) else (b, a) 118 | case n if n < 0 => (a, b) 119 | case _ => (b, a) 120 | } 121 | 122 | (attributeQualifiedName(left), attributeQualifiedName(right)) match { 123 | case (("repositories", "id"), ("references", "repository_id")) => true 124 | case (("references", "name"), ("commits", "reference_name")) => true 125 | case (("tree_entries", "commit_hash"), ("commits", "hash")) => true 126 | case (("tree_entries", "blob"), ("blobs", "blob_id")) => true 127 | // source does not matter in these cases 128 | case ((_, "repository_id"), (_, "repository_id")) => true 129 | case ((_, "reference_name"), (_, "reference_name")) => true 130 | case ((_, "commit_hash"), (_, "commit_hash")) => true 131 | case _ => false 132 | } 133 | } 134 | 135 | def attributeSource(a: AttributeReference): Option[String] = 136 | if (a.metadata.contains(Sources.SourceKey)) { 137 | Some(a.metadata.getString(Sources.SourceKey)) 138 | } else { 139 | None 140 | } 141 | 142 | def attributeQualifiedName(a: AttributeReference): (String, String) = 143 | (attributeSource(a).getOrElse(""), a.name) 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ClassifyLanguagesUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | import tech.sourced.enry.Enry 8 | 9 | /** User defined function to guess languages of files. */ 10 | case object ClassifyLanguagesUDF extends CustomUDF with Logging { 11 | 12 | override val name = "classifyLanguages" 13 | 14 | override def apply(session: SparkSession): UserDefinedFunction = 15 | udf[Option[String], Boolean, String, Array[Byte]](getLanguage) 16 | 17 | /** 18 | * Gets the language of the given file and returns the guessed language or none. 19 | * 20 | * @param isBinary whether it's a binary file or not 21 | * @param path file path 22 | * @param content file content 23 | * @return `None` if no language could be guessed, `Some(language)` otherwise. 
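 * Illustrative example, assuming enry maps the .py extension to Python:
 * {{{
 *   ClassifyLanguagesUDF.getLanguage(isBinary = false, "hello.py", "print('hi')".getBytes)
 *   // => Some("Python")
 * }}}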
24 | */ 25 | def getLanguage(isBinary: Boolean, path: String, content: Array[Byte]): Option[String] = { 26 | timer.time({ 27 | if (isBinary) { 28 | None 29 | } else { 30 | val lang = try { 31 | Enry.getLanguage(path, content) 32 | } catch { 33 | case e@(_: RuntimeException | _: Exception) => 34 | log.error(s"get language for file '$path' failed", e) 35 | null 36 | } 37 | if (null == lang || lang.isEmpty) None else Some(lang) 38 | } 39 | }) 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ConcatArrayUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.expressions.UserDefinedFunction 5 | import org.apache.spark.sql.functions.udf 6 | 7 | 8 | /** User defined function to concat array elements with the given separator. */ 9 | case object ConcatArrayUDF extends CustomUDF { 10 | 11 | override val name = "concatArray" 12 | 13 | override def apply(session: SparkSession): UserDefinedFunction = { 14 | udf[String, Seq[String], String]((arr, sep) => arr.mkString(sep)) 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/CustomUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.groupon.metrics.{NotInitializedException, SparkTimer, UserMetricsSystem} 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.expressions.UserDefinedFunction 7 | 8 | /** 9 | * Custom named user defined function. 10 | */ 11 | abstract class CustomUDF { 12 | /** timer intended to be used on UDF logic **/ 13 | lazy protected val timer: SparkTimerUDFWrapper = new SparkTimerUDFWrapper(name) 14 | 15 | /** Name of the function. */ 16 | val name: String 17 | 18 | /** Function to execute when this function is called. */ 19 | def apply(session: SparkSession): UserDefinedFunction 20 | 21 | def apply(): UserDefinedFunction = this.apply(session = null) 22 | } 23 | 24 | sealed class SparkTimerUDFWrapper(name: String) extends Logging { 25 | lazy val timer: SparkTimer = init() 26 | 27 | private def init(): SparkTimer = { 28 | try { 29 | UserMetricsSystem.timer(name) 30 | } catch { 31 | case _: NotInitializedException => { 32 | logWarning("SparkMetric not initialized on UDF") 33 | null 34 | } 35 | } 36 | 37 | } 38 | 39 | def time[T](f: => T): T = 40 | if (timer == null) { 41 | f 42 | } else { 43 | timer.time(f) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ExtractTokensUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | 8 | /** User defined function to extract tokens from an UAST. 
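 * Takes a sequence of binary-encoded UAST nodes and returns the sequence of their tokens.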
*/ 9 | case object ExtractTokensUDF extends CustomUDF { 10 | 11 | override val name = "extractTokens" 12 | 13 | override def apply(session: SparkSession): UserDefinedFunction = 14 | udf[Seq[String], Seq[Array[Byte]]](extractTokens) 15 | 16 | private def extractTokens(nodes: Seq[Array[Byte]]): Seq[String] = { 17 | timer.time({ 18 | if (nodes == null) { 19 | Seq() 20 | } else { 21 | nodes.map(Node.parseFrom).map(_.token) 22 | } 23 | }) 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/ExtractUASTsUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.expressions.UserDefinedFunction 5 | import org.apache.spark.sql.functions.udf 6 | import tech.sourced.engine.util.Bblfsh 7 | 8 | trait ExtractUASTsUDF { 9 | 10 | def extractUASTs(path: String, 11 | content: Array[Byte], 12 | lang: String = null, 13 | config: Bblfsh.Config): Seq[Array[Byte]] = { 14 | if (content == null || content.isEmpty) { 15 | Seq() 16 | } else { 17 | Bblfsh.extractUAST(path, content, lang, config) 18 | } 19 | } 20 | 21 | } 22 | 23 | /** Common entry point to use extraction UAST UDFs with or without language parameter. */ 24 | case object ExtractUASTsUDF extends CustomUDF with ExtractUASTsUDF { 25 | 26 | override val name = "extractUASTs" 27 | 28 | override def apply(session: SparkSession): UserDefinedFunction = { 29 | val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session)) 30 | udf[Seq[Array[Byte]], String, Array[Byte], String]((path, content, lang) => 31 | extractUASTs(path, content, lang, configB.value)) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/udf/QueryXPathUDF.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.udf 2 | 3 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.expressions.UserDefinedFunction 6 | import org.apache.spark.sql.functions.udf 7 | import tech.sourced.engine.util.Bblfsh 8 | 9 | 10 | /** User defined function to perform XPath queries on UASTs. 
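 * Takes a sequence of binary-encoded UAST nodes plus an XPath query and returns the
 * matching nodes, binary-encoded as well.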
*/ 11 | case object QueryXPathUDF extends CustomUDF { 12 | 13 | override val name = "queryXPath" 14 | 15 | override def apply(session: SparkSession): UserDefinedFunction = { 16 | val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session)) 17 | udf[Seq[Array[Byte]], Seq[Array[Byte]], String]((nodes, query) => 18 | queryXPath(nodes, query, configB.value)) 19 | } 20 | 21 | private def queryXPath(nodes: Seq[Array[Byte]], 22 | query: String, 23 | config: Bblfsh.Config): Seq[Array[Byte]] = { 24 | timer.time({ 25 | if (nodes == null) { 26 | return null 27 | } 28 | 29 | nodes.map(Node.parseFrom).flatMap(n => { 30 | val result = Bblfsh.filter(n, query, config) 31 | if (result == null) { 32 | None 33 | } else { 34 | result.toIterator 35 | } 36 | }).map(_.toByteArray) 37 | }) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/Bblfsh.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import gopkg.in.bblfsh.sdk.v1.protocol.generated.Status 6 | import gopkg.in.bblfsh.sdk.v1.uast.generated.Node 7 | import org.apache.spark.internal.Logging 8 | import org.apache.spark.sql.SparkSession 9 | import org.bblfsh.client.BblfshClient 10 | import tech.sourced.engine.util.Bblfsh.client 11 | 12 | object Bblfsh extends Logging { 13 | 14 | case class Config(host: String, port: Int) 15 | 16 | /** Key used for the option to specify the host of the bblfsh grpc service. */ 17 | val hostKey = "spark.tech.sourced.bblfsh.grpc.host" 18 | 19 | /** Key used for the option to specify the port of the bblfsh grpc service. */ 20 | val portKey = "spark.tech.sourced.bblfsh.grpc.port" 21 | 22 | /** Default bblfsh host. */ 23 | val defaultHost = "0.0.0.0" 24 | 25 | /** Default bblfsh port. */ 26 | val defaultPort = 9432 27 | 28 | var supportedLanguages: Set[String] = Set() 29 | 30 | private var config: Config = _ 31 | private var client: BblfshClient = _ 32 | 33 | /** 34 | * Returns the configuration for bblfsh. 35 | * 36 | * @param session Spark session 37 | * @return bblfsh configuration 38 | */ 39 | def getConfig(session: SparkSession): Config = { 40 | if (config == null) { 41 | val host = session.conf.get(hostKey, Bblfsh.defaultHost) 42 | val port = session.conf.get(portKey, Bblfsh.defaultPort.toString).toInt 43 | config = Config(host, port) 44 | } 45 | 46 | config 47 | } 48 | 49 | private def getClient(config: Config): BblfshClient = synchronized { 50 | if (client == null) { 51 | client = BblfshClient(config.host, config.port) 52 | } 53 | 54 | client 55 | } 56 | 57 | private def getSupportedLanguages(config: Config): Set[String] = synchronized { 58 | if (supportedLanguages.isEmpty) { 59 | val client = getClient(config) 60 | supportedLanguages = client.supportedLanguages() 61 | .languages.map(m => m.language) 62 | .toSet 63 | } 64 | 65 | supportedLanguages 66 | } 67 | 68 | private def shouldExtractLanguage(config: Config, lang: String): Boolean = { 69 | val supportedLanguages = getSupportedLanguages(config) 70 | supportedLanguages.contains(lang.toLowerCase()) 71 | } 72 | 73 | /** 74 | * Extracts the UAST using bblfsh. 
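 * If lang is null the bblfsh server guesses the language; if a language is provided but
 * not supported by the server, an empty result is returned without calling parse.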
75 | * 76 | * @param path File path 77 | * @param content File content 78 | * @param lang File language 79 | * @param config bblfsh configuration 80 | * @return List of uast nodes binary-encoded as a byte array 81 | */ 82 | def extractUAST(path: String, 83 | content: Array[Byte], 84 | lang: String, 85 | config: Config): Seq[Array[Byte]] = { 86 | //FIXME(bzz): not everything is UTF-8 encoded :/ 87 | // if lang == null, it hasn't been classified yet 88 | // so rely on bblfsh to guess this file's language 89 | if (lang != null && !shouldExtractLanguage(config, lang)) { 90 | Seq() 91 | } else { 92 | val client = getClient(config) 93 | val contentStr = new String(content, StandardCharsets.UTF_8) 94 | val parsed = client.parse(path, content = contentStr, lang = lang) 95 | if (parsed.status == Status.OK) { 96 | Seq(parsed.uast.get.toByteArray) 97 | } else { 98 | logWarning(s"${parsed.status} $path: ${parsed.errors.mkString("; ")}") 99 | Seq() 100 | } 101 | } 102 | } 103 | 104 | /** 105 | * Filter an UAST node using the given query. 106 | * 107 | * @param node An UAST node 108 | * @param query XPath expression 109 | * @param config bblfsh configuration 110 | * @return UAST list of filtered nodes 111 | */ 112 | def filter(node: Node, query: String, config: Config): List[Node] = { 113 | getClient(config).filter(node, query) 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/GitUrlsParser.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.net.{URI, URISyntaxException} 4 | 5 | object GitUrlsParser { 6 | private val isGit = """(.+)\@(.+):(.+)\.git""".r 7 | 8 | /** 9 | * Retrieves the URL that will act as identifier in a list of URLs 10 | * for a repository. 11 | * 12 | * @param urls array of urls 13 | * @return processed id 14 | */ 15 | def getIdFromUrls(urls: Array[String]): String = { 16 | urls.flatMap({ 17 | case isGit(_, host, path, _*) => 18 | Some(s"$host/$path") 19 | case s => try { 20 | val u: URI = new URI(s) 21 | Some(u.getHost + u.getPath) 22 | } catch { 23 | case _: URISyntaxException => None 24 | } 25 | }).distinct.min 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/tech/sourced/engine/util/MD5Gen.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.security.MessageDigest 4 | import javax.xml.bind.annotation.adapters.HexBinaryAdapter 5 | 6 | /** 7 | * Convenience wrapper around java [[java.security.MessageDigest]] for easier md5 hashing. 8 | */ 9 | object MD5Gen { 10 | private val ba = new HexBinaryAdapter() 11 | 12 | /** 13 | * Hashes the given string using md5. 
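 * The digest is returned as a 32-character hexadecimal string.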
14 | * 15 | * @param s string to hash 16 | * @return hashed string 17 | */ 18 | def str(s: String): String = synchronized { 19 | ba.marshal(MessageDigest.getInstance("MD5").digest(s.getBytes())) 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /src/test/resources/bad-siva-files/0a0bfaa46954437548fbaeb0e19237f84e968511.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/bad-siva-files/0a0bfaa46954437548fbaeb0e19237f84e968511.siva -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Define the root logger with appender 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=ERROR 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.tech.sourced.engine.iterator.BlobIterator=ERROR 12 | log4j.logger.tech.sourced.engine.provider.RepositoryProvider=ERROR 13 | -------------------------------------------------------------------------------- /src/test/resources/siva-files/05893125684f2d3943cd84a7ab2b75e53668fba1.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/05893125684f2d3943cd84a7ab2b75e53668fba1.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/ff/fff840f8784ef162dc83a1465fc5763d890b68ba.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/ff/fff840f8784ef162dc83a1465fc5763d890b68ba.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/fff7062de8474d10a67d417ccea87ba6f58ca81d.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/fff7062de8474d10a67d417ccea87ba6f58ca81d.siva -------------------------------------------------------------------------------- /src/test/resources/siva-files/not-siva.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/siva-files/not-siva.txt -------------------------------------------------------------------------------- /src/test/resources/zip-slip-siva-files/git-zipslip.siva: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/jgit-spark-connector/79d05a0bcf0da435685d6118828a8884e2fe4b94/src/test/resources/zip-slip-siva-files/git-zipslip.siva 
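The fixture sets above map onto the specs that follow: /siva-files backs the main suites, /bad-siva-files exercises skipReadErrors, and /zip-slip-siva-files triggers the zip-slip guard. A minimal, hypothetical sketch of the skip-read-errors path, built only from calls that appear in EngineSpec below and assuming the fixtures are available on the classpath:

```scala
import org.apache.spark.sql.SparkSession
import tech.sourced.engine._

object BadSivaExample extends App {
  val spark = SparkSession.builder()
    .appName("bad-siva-fixtures")
    .master("local[*]")
    .getOrCreate()

  // skipReadErrors(true) makes the engine skip corrupt siva archives instead of
  // failing the whole job, which is what EngineSpec asserts against /bad-siva-files.
  val engine = Engine(spark, getClass.getResource("/bad-siva-files").toString, "siva")
    .skipReadErrors(true)

  println(engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs.count())

  spark.stop()
}
```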
-------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSivaSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | trait BaseSivaSpec { 4 | val resourcePath: String = getClass.getResource("/siva-files").toString 5 | } 6 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.SparkException 4 | import org.scalatest._ 5 | 6 | class BaseSourceSpec(source: String = "BaseSourceSpec") 7 | extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec with BeforeAndAfterEach { 8 | 9 | var engine: Engine = _ 10 | 11 | override protected def beforeEach(): Unit = { 12 | super.beforeAll() 13 | 14 | engine = Engine(ss, resourcePath, "siva") 15 | } 16 | 17 | source should "get heads of all repositories and count the files" in { 18 | val df = engine.getRepositories 19 | .getHEAD 20 | .getCommits 21 | .getTreeEntries 22 | .getBlobs 23 | .select("commit_hash", "path", "content", "is_binary") 24 | .distinct() 25 | df.count should be(457) 26 | } 27 | 28 | it should "count all the commit messages from all masters that are not forks" in { 29 | val commits = engine.getRepositories.filter("is_fork = false").getMaster.getAllReferenceCommits 30 | val df = commits.select("message").filter(commits("message").startsWith("a")) 31 | df.count should be(7) 32 | } 33 | 34 | it should "count all commits messages from all references that are not forks" in { 35 | val commits = engine.getRepositories.filter("is_fork = false").getReferences 36 | .getAllReferenceCommits 37 | val df = commits.select("message", "reference_name", "hash"). 
38 | filter(commits("message").startsWith("a")) 39 | df.count should be(98) 40 | } 41 | 42 | it should "get all files from HEADS that are Ruby" in { 43 | val blobs = engine.getRepositories.filter("is_fork = false") 44 | .getHEAD 45 | .getCommits 46 | .getTreeEntries 47 | .getBlobs 48 | .classifyLanguages 49 | val df = blobs.filter(blobs("lang") === "Ruby").select("lang", "path") 50 | df.count should be(169) 51 | } 52 | 53 | it should "get all tree entries" in { 54 | val df = engine.getRepositories.getReferences.getAllReferenceCommits.getTreeEntries 55 | df.count() should be(304362) 56 | } 57 | 58 | it should "filter by reference from repos dataframe" in { 59 | val spark = ss 60 | 61 | val df = Engine(spark, resourcePath, "siva") 62 | .getRepositories 63 | .getReference("refs/heads/develop") 64 | assert(df.count == 2) 65 | } 66 | 67 | "Filter by HEAD reference" should "return only HEAD references" in { 68 | val spark = ss 69 | val df = Engine(spark, resourcePath, "siva").getRepositories.getHEAD 70 | assert(df.count == 5) 71 | } 72 | 73 | "Filter by master reference" should "return only master references" in { 74 | val df = engine.getRepositories.getMaster 75 | assert(df.count == 5) 76 | } 77 | 78 | "Get develop commits" should "return only develop commits" in { 79 | val df = engine.getRepositories 80 | .getReference("refs/heads/develop").getAllReferenceCommits 81 | .select("hash", "repository_id") 82 | assert(df.count == 103) 83 | } 84 | 85 | "Get files after reading commits" should "return the correct files" in { 86 | val files = engine.getRepositories 87 | .getReferences 88 | .getAllReferenceCommits 89 | .getBlobs 90 | .drop("repository_id", "reference_name") 91 | .distinct() 92 | 93 | assert(files.count == 91944) 94 | } 95 | 96 | "Get files without reading tree entries" should "return the correct files" in { 97 | val files = engine.getRepositories 98 | .getReferences 99 | .getAllReferenceCommits 100 | .getBlobs 101 | .drop("repository_id", "reference_name") 102 | .distinct() 103 | 104 | assert(files.count == 91944) 105 | } 106 | 107 | "Get files" should "return the correct files" in { 108 | val df = engine.getRepositories.getHEAD.getAllReferenceCommits 109 | .sort("hash").limit(10) 110 | val rows = df.collect() 111 | .map(row => (row.getString(row.fieldIndex("repository_id")), 112 | row.getString(row.fieldIndex("hash")))) 113 | val repositories = rows.map(_._1) 114 | val hashes = rows.map(_._2) 115 | 116 | val files = engine 117 | .getBlobs(repositories.distinct, List("refs/heads/HEAD"), hashes.distinct) 118 | .drop("repository_id", "reference_name") 119 | .distinct() 120 | 121 | assert(files.count == 655) 122 | } 123 | 124 | it should "return the correct files if we filter by repository" in { 125 | val files = engine 126 | .getBlobs(repositoryIds = List("github.com/xiyou-linuxer/faq-xiyoulinux")) 127 | .drop("repository_id", "reference_name") 128 | .distinct() 129 | 130 | assert(files.count == 2421) 131 | } 132 | 133 | it should "return the correct files if we filter by reference" in { 134 | val files = engine 135 | .getBlobs(referenceNames = List("refs/heads/develop")) 136 | .drop("repository_id", "reference_name") 137 | .distinct() 138 | 139 | assert(files.count == 425) 140 | } 141 | 142 | it should "return the correct files if we filter by commit" in { 143 | val files = engine 144 | .getBlobs(commitHashes = List("fff7062de8474d10a67d417ccea87ba6f58ca81d")) 145 | .drop("repository_id", "reference_name") 146 | .distinct() 147 | assert(files.count == 2) 148 | } 149 | 150 | override 
protected def afterEach(): Unit = { 151 | super.afterEach() 152 | 153 | engine = _: Engine 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/BaseSparkSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | trait BaseSparkSpec extends BeforeAndAfterAll { 7 | this: Suite => 8 | 9 | var ss: SparkSession = _ 10 | 11 | override protected def beforeAll(): Unit = { 12 | super.beforeAll() 13 | ss = SparkSession.builder() 14 | .appName("test").master("local[*]") 15 | .config("spark.driver.host", "localhost") 16 | .getOrCreate() 17 | ss.registerUDFs() 18 | } 19 | 20 | override protected def afterAll(): Unit = { 21 | super.afterAll() 22 | ss = null 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/DefaultSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.eclipse.jgit.api.Git 8 | 9 | class DefaultSourceSpec extends BaseSourceSpec("DefaultSource") { 10 | 11 | var tmpPath: Path = Paths.get(System.getProperty("java.io.tmpdir"), UUID.randomUUID.toString) 12 | 13 | override protected def beforeAll(): Unit = { 14 | super.beforeAll() 15 | 16 | tmpPath.toFile.mkdir() 17 | } 18 | 19 | "DefaultSource" should "not optimize if the conditions on the " + 20 | "join are not the expected ones" in { 21 | val repos = engine.getRepositories 22 | val references = ss.read.format("tech.sourced.engine").option("table", "references").load() 23 | val out = repos.join(references, 24 | (references("repository_id") === repos("id")) 25 | .and(references("name").startsWith("refs/pull")) 26 | ).count() 27 | 28 | val df = references.limit(1).getCommits 29 | df.count() should be(1) 30 | } 31 | 32 | it should "return the remote branches renamed to refs/heads" in { 33 | val repoDir = tmpPath.resolve("repo") 34 | 35 | Git.cloneRepository() 36 | .setURI("https://github.com/src-d/jgit-spark-connector.git") 37 | .setDirectory(repoDir.toFile) 38 | .call() 39 | 40 | val engine = Engine(ss, tmpPath.toString, "standard") 41 | val masters = engine.getRepositories 42 | .getMaster 43 | .collect() 44 | .sortBy(_.getAs[String]("repository_id")) 45 | 46 | masters.length should be(2) 47 | masters(0).getAs[String]("repository_id") should startWith("file") 48 | masters(0).getAs[Boolean]("is_remote") should be(false) 49 | 50 | masters(1).getAs[String]("repository_id") should startWith("github") 51 | masters(1).getAs[Boolean]("is_remote") should be(true) 52 | 53 | engine.getRepositories.getRemoteReferences.getMaster.count() should be(1) 54 | } 55 | 56 | it should "match HEAD and not just refs/heads/HEAD" in { 57 | val repoDir = tmpPath.resolve("repo") 58 | 59 | import tech.sourced.engine.util.RepoUtils._ 60 | 61 | val repo = createRepo(repoDir) 62 | commitFile(repo, "foo", "bar", "baz") 63 | 64 | Engine(ss, tmpPath.toString, "standard").getRepositories.getHEAD.count() should be(1) 65 | } 66 | 67 | it should "traverse all commits if it's not chained" in { 68 | val row = engine.session.sql("SELECT COUNT(*) FROM commits").first() 69 | row(0) should be(4444) 70 | 71 | val row2 = 
engine.session.sql("SELECT COUNT(*) FROM commits WHERE index > 0").first() 72 | row2(0) should be(4390) 73 | } 74 | 75 | override protected def afterAll(): Unit = { 76 | super.afterAll() 77 | 78 | FileUtils.deleteQuietly(tmpPath.toFile) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/EngineSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.{Properties, UUID} 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.apache.spark.SparkException 8 | import org.scalatest.{FlatSpec, Matchers} 9 | 10 | class EngineSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 11 | 12 | var engine: Engine = _ 13 | var tmpPath: Path = _ 14 | 15 | override protected def beforeAll(): Unit = { 16 | super.beforeAll() 17 | 18 | engine = Engine(ss, resourcePath, "siva") 19 | tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 20 | .resolve(UUID.randomUUID.toString) 21 | tmpPath.toFile.mkdir() 22 | } 23 | 24 | override def afterAll(): Unit = { 25 | super.afterAll() 26 | FileUtils.deleteQuietly(tmpPath.toFile) 27 | } 28 | 29 | "saveMetadata" should "store all metadata tables in a SQLite db" in { 30 | engine.saveMetadata(tmpPath.toString) 31 | 32 | val dbFile = tmpPath.resolve("engine_metadata.db") 33 | dbFile.toFile.exists should be(true) 34 | 35 | val properties = new Properties() 36 | properties.put("driver", "org.sqlite.JDBC") 37 | 38 | val reposDf = engine.getRepositories 39 | val refsDf = reposDf.getReferences 40 | val repoHasCommitsDf = refsDf.getAllReferenceCommits 41 | .select("reference_name", "repository_id", "hash", "index") 42 | val commitsDf = refsDf.getAllReferenceCommits 43 | .drop("index", "reference_name", "repository_id") 44 | .distinct() 45 | val treeEntriesDf = refsDf.getAllReferenceCommits.getTreeEntries 46 | .drop("reference_name", "repository_id") 47 | .distinct() 48 | 49 | Seq( 50 | (RepositoriesTable, reposDf), 51 | (ReferencesTable, refsDf), 52 | (RepositoryHasCommitsTable, repoHasCommitsDf), 53 | (CommitsTable, commitsDf), 54 | (TreeEntriesTable, treeEntriesDf) 55 | ).foreach { 56 | case (table, df) => 57 | val count = df.count() 58 | ss.read.jdbc(s"jdbc:sqlite:$dbFile", Tables.prefix(table), properties) 59 | .count() should be(count) 60 | } 61 | } 62 | 63 | "skipReadErrors" should "skip all read errors" in { 64 | val resourcePath = getClass.getResource("/bad-siva-files").toString 65 | val engine = Engine(ss, resourcePath, "siva").skipReadErrors(true) 66 | val tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 67 | .resolve(UUID.randomUUID.toString) 68 | tmpPath.toFile.mkdir() 69 | 70 | val cnt = engine 71 | .getRepositories 72 | .getReferences 73 | .getCommits 74 | .getTreeEntries 75 | .getBlobs 76 | .count() 77 | 78 | cnt should be(8663) 79 | 80 | FileUtils.deleteQuietly(tmpPath.toFile) 81 | } 82 | 83 | "engine" should "throw an error when a siva file contains a zip-slip vulnerability" in { 84 | val resourcePath = getClass.getResource("/zip-slip-siva-files").toString 85 | val engine = Engine(ss, resourcePath, "siva") 86 | val tmpPath = Paths.get(System.getProperty("java.io.tmpdir")) 87 | .resolve(UUID.randomUUID.toString) 88 | tmpPath.toFile.mkdir() 89 | 90 | val ex = intercept[SparkException] { 91 | engine 92 | .getRepositories 93 | .getReferences 94 | .getCommits 95 | .getTreeEntries 96 | .getBlobs 97 | .count() 98 | } 99 | 
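    // The archive under /zip-slip-siva-files contains an entry whose path climbs out of the
    // extraction directory (zip-slip), so unpacking aborts and the cause message asserted
    // below is surfaced instead of any file being written outside the target dir.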
100 | ex.getCause.getMessage should be("Entry is outside of the target dir: objects/../../imoutside") 101 | 102 | FileUtils.deleteQuietly(tmpPath.toFile) 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/FilterUDFSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | 5 | class FilterUDFSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 6 | 7 | var engine: Engine = _ 8 | 9 | override protected def beforeAll(): Unit = { 10 | super.beforeAll() 11 | engine = Engine(ss, resourcePath, "siva") 12 | } 13 | 14 | "Filter by language" should "work properly" in { 15 | val langDf = engine 16 | .getRepositories 17 | .getReferences 18 | .getCommits 19 | .getBlobs 20 | .classifyLanguages 21 | 22 | val filteredLang = langDf.select("repository_id", "path", "lang").where("lang='Python'") 23 | filteredLang.count() should be(6) 24 | } 25 | 26 | override protected def afterAll(): Unit = { 27 | super.afterAll() 28 | engine = _: Engine 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/MetadataSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | 8 | class MetadataSourceSpec extends BaseSourceSpec("MetadataSource") { 9 | 10 | private var tmpDir: Path = Paths.get( 11 | System.getProperty("java.io.tmpdir"), 12 | UUID.randomUUID().toString 13 | ) 14 | 15 | override protected def beforeAll(): Unit = { 16 | super.beforeAll() 17 | 18 | tmpDir.toFile.mkdir() 19 | 20 | engine = Engine(ss, resourcePath, "siva") 21 | engine.saveMetadata(tmpDir.toString) 22 | engine = engine.fromMetadata(tmpDir.toString) 23 | } 24 | 25 | override protected def afterAll(): Unit = { 26 | super.afterAll() 27 | FileUtils.deleteQuietly(tmpDir.toFile) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/QueryBuilderSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import java.sql.{Date, Timestamp} 4 | 5 | import org.apache.spark.sql.catalyst.expressions._ 6 | import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType} 7 | import org.apache.spark.unsafe.types.UTF8String 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import QueryBuilder._ 10 | 11 | class QueryBuilderSpec extends FlatSpec with Matchers { 12 | 13 | "QueryBuilder.qualify" should "qualify and quote col" in { 14 | val expected = s"${prefixTable("foo")}.`bar`" 15 | qualify("foo", "bar") should be(expected) 16 | qualify(attr("foo", "bar")) should be(expected) 17 | } 18 | 19 | "QueryBuilder.compileValue" should "return compiled value" in { 20 | val now = System.currentTimeMillis 21 | val cases = Seq( 22 | (UTF8String.fromString("foo"), "'foo'"), 23 | ("fo'o", "'fo''o'"), 24 | (new Timestamp(now), s"'${new Timestamp(now)}'"), 25 | (new Date(now), s"'${new Date(now)}'"), 26 | (Seq("a", 1, true), "'a', 1, 1"), 27 | (true, 1), 28 | (false, 0) 29 | ) 30 | 31 | cases.foreach { 32 | case (input, expected) => 33 | compileValue(input) should be(expected) 34 | } 35 | } 36 | 37 | 
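  // As exercised above, compileValue quotes strings (doubling embedded single quotes),
  // renders Timestamp/Date values as quoted literals, joins sequences with commas, and
  // maps booleans to 1/0; the compileFilter cases below embed literals in the same form.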
"QueryBuilder.compileFilter" should "compile the filters to SQL" in { 38 | val col = qualify("foo", "bar") 39 | val cases = Seq( 40 | (EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 41 | s"$col = 1"), 42 | (EqualNullSafe(attr("foo", "bar"), Literal(1, IntegerType)), 43 | s"(NOT ($col != 1 OR $col IS NULL OR 1 IS NULL) OR ($col IS NULL AND 1 IS NULL))"), 44 | (LessThan(attr("foo", "bar"), Literal(1, IntegerType)), 45 | s"$col < 1"), 46 | (GreaterThan(attr("foo", "bar"), Literal(1, IntegerType)), 47 | s"$col > 1"), 48 | (LessThanOrEqual(attr("foo", "bar"), Literal(1, IntegerType)), 49 | s"$col <= 1"), 50 | (GreaterThanOrEqual(attr("foo", "bar"), Literal(1, IntegerType)), 51 | s"$col >= 1"), 52 | (IsNull(attr("foo", "bar")), s"$col IS NULL"), 53 | (IsNotNull(attr("foo", "bar")), s"$col IS NOT NULL"), 54 | (In(attr("foo", "bar"), Seq()), s"CASE WHEN $col IS NULL THEN NULL ELSE FALSE END"), 55 | (In(attr("foo", "bar"), Seq(Literal(1, IntegerType), Literal(2, IntegerType))), 56 | s"$col IN (1, 2)"), 57 | (Not(EqualTo(attr("foo", "bar"), Literal(1, IntegerType))), 58 | s"(NOT ($col = 1))"), 59 | (Or(EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 60 | EqualTo(attr("foo", "bar"), Literal(2, IntegerType)) 61 | ), 62 | s"(($col = 1) OR ($col = 2))"), 63 | (And(EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 64 | EqualTo(attr("foo", "bar"), Literal(2, IntegerType)) 65 | ), 66 | s"($col = 1) AND ($col = 2)") 67 | ) 68 | 69 | cases.foreach { 70 | case (expr, expected) => 71 | compileFilter(expr).get should be(expected) 72 | } 73 | } 74 | 75 | "QueryBuilder.selectedFields" should "return SQL for selected tables" in { 76 | QueryBuilder(tables = Seq("repositories")) 77 | .selectedFields should be(s"${qualify("repositories", "id")}") 78 | 79 | QueryBuilder(fields = Seq( 80 | attr("repositories", "id"), 81 | attr("references", "name") 82 | )).selectedFields should be( 83 | s"${qualify("repositories", "id")}, ${qualify("references", "name")}" 84 | ) 85 | } 86 | 87 | "QueryBuilder.whereClause" should "return SQL for where clause" in { 88 | QueryBuilder().whereClause should be("") 89 | 90 | QueryBuilder(filters = Seq( 91 | EqualTo(attr("foo", "bar"), Literal(1, IntegerType)) 92 | )).whereClause should be(s"WHERE ${qualify("foo", "bar")} = 1") 93 | 94 | QueryBuilder(filters = Seq( 95 | EqualTo(attr("foo", "bar"), Literal(1, IntegerType)), 96 | EqualTo(attr("foo", "baz"), Literal(2, IntegerType)) 97 | )).whereClause should be(s"WHERE ${qualify("foo", "bar")} = 1 AND ${qualify("foo", "baz")} = 2") 98 | } 99 | 100 | "QueryBuilder.selectedTables" should "return SQL for selected tables" in { 101 | QueryBuilder(tables = Seq("repositories")) 102 | .selectedTables should be(s"${prefixTable("repositories")}") 103 | 104 | QueryBuilder(joins = Seq( 105 | Join("repositories", "references", Seq( 106 | JoinCondition("repositories", "id", "references", "repository_id") 107 | )), 108 | Join("references", "commits", Seq( 109 | JoinCondition("references", "repository_id", "commits", "repository_id"), 110 | JoinCondition("references", "name", "commits", "reference_name") 111 | )) 112 | )).selectedTables should be(s"${prefixTable("repositories")} INNER JOIN " + 113 | s"${prefixTable("references")} ON (" + 114 | s"${qualify("repositories", "id")} = ${qualify("references", "repository_id")}) INNER JOIN " + 115 | s"${prefixTable("commits")} ON (${qualify("references", "repository_id")} = " + 116 | s"${qualify("commits", "repository_id")} AND ${qualify("references", "name")} = " + 117 | s"${qualify("commits", 
"reference_name")})") 118 | } 119 | 120 | "QueryBuilder.sql" should "return SQL for the query" in { 121 | QueryBuilder( 122 | fields = Seq(attr("repositories", "id")), 123 | tables = Seq("repositories"), 124 | filters = Seq(EqualTo(attr("repositories", "id"), Literal("foo", StringType))) 125 | ).sql should be(s"SELECT ${qualify("repositories", "id")} " + 126 | s"FROM ${prefixTable("repositories")} " + 127 | s"WHERE ${qualify("repositories", "id")} = ${compileValue("foo")}") 128 | } 129 | 130 | def attr(table: String, name: String): Attribute = 131 | AttributeReference( 132 | name, 133 | StringType, 134 | nullable = false, 135 | new MetadataBuilder().putString(Sources.SourceKey, table).build() 136 | )() 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/StorageLevelSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | 5 | class StorageLevelSpec extends FlatSpec with Matchers with BaseSivaSpec with BaseSparkSpec { 6 | 7 | var engine: Engine = _ 8 | 9 | override protected def beforeAll(): Unit = { 10 | super.beforeAll() 11 | engine = Engine(ss, resourcePath, "siva") 12 | } 13 | 14 | "A Dataframe" should "work with all storage levels" in { 15 | import org.apache.spark.storage.StorageLevel._ 16 | val storageLevels = List( 17 | DISK_ONLY, 18 | DISK_ONLY_2, 19 | MEMORY_AND_DISK, 20 | MEMORY_AND_DISK_2, 21 | MEMORY_AND_DISK_SER, 22 | MEMORY_AND_DISK_SER_2, 23 | MEMORY_ONLY, 24 | MEMORY_ONLY_2, 25 | MEMORY_ONLY_SER, 26 | MEMORY_ONLY_SER_2, 27 | NONE, 28 | OFF_HEAP 29 | ) 30 | 31 | storageLevels.foreach(level => { 32 | val df = engine.getRepositories.persist(level) 33 | df.count() 34 | df.unpersist() 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/BaseChainableIterator.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.eclipse.jgit.lib.Repository 6 | import org.scalatest.{Matchers, Suite} 7 | import tech.sourced.engine.provider.{RepositoryProvider, RepositorySource, RepositoryRDDProvider} 8 | import tech.sourced.engine.{BaseSivaSpec, BaseSparkSpec} 9 | 10 | trait BaseChainableIterator extends Suite with BaseSparkSpec with BaseSivaSpec with Matchers { 11 | override def afterAll(): Unit = { 12 | super.afterAll() 13 | provider.close(source, repo) 14 | } 15 | 16 | lazy val prov: RepositoryRDDProvider = RepositoryRDDProvider(ss.sparkContext) 17 | lazy val rdd: RDD[RepositorySource] = prov.get(resourcePath, RepositoryRDDProvider.SivaFormat) 18 | 19 | lazy val source: RepositorySource = rdd.filter(source => source.pds.getPath() 20 | .endsWith("fff7062de8474d10a67d417ccea87ba6f58ca81d.siva")).first() 21 | lazy val provider: RepositoryProvider = RepositoryProvider("/tmp") 22 | lazy val repo: Repository = provider.get(source) 23 | 24 | def testIterator(iterator: (Repository) => Iterator[Row], 25 | matcher: (Int, Row) => Unit, 26 | total: Int, 27 | columnsCount: Int): Unit = { 28 | val ri: Iterator[Row] = iterator(repo) 29 | 30 | var count: Int = 0 31 | while (ri.hasNext) { 32 | val row: Row = ri.next() 33 | row.length should be(columnsCount) 34 | matcher(count, row) 35 | count += 1 36 | } 37 | 38 | count should be(total) 39 | } 40 | 
} 41 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/GitTreeEntryIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.scalatest.FlatSpec 4 | import tech.sourced.engine.util.{Attr, EqualFilter, NotFilter} 5 | 6 | class GitTreeEntryIteratorSpec extends FlatSpec with BaseChainableIterator { 7 | 8 | private val cols = Array( 9 | "commit_hash", 10 | "repository_id", 11 | "reference_name", 12 | "path", 13 | "blob" 14 | ) 15 | 16 | private val allCommitsFilter = NotFilter(EqualFilter(Attr("index", "commits"), -1)) 17 | 18 | "GitTreeEntryIterator" should "return all tree entries from all commits " + 19 | "from all repositories into a siva file" in { 20 | testIterator(repo => 21 | new GitTreeEntryIterator( 22 | cols, 23 | repo, 24 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 25 | Seq(), 26 | false 27 | ), { 28 | case (0, row) => 29 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 30 | row.getString(1) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 31 | row.getString(2) should be("refs/heads/HEAD") 32 | row.getString(3) should be("LICENSE") 33 | row.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 34 | case (1, row) => 35 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 36 | row.getString(1) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 37 | row.getString(2) should be("refs/heads/HEAD") 38 | row.getString(3) should be("README.md") 39 | row.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 40 | case (2, row) => 41 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 42 | row.getString(1) should be("github.com/mawag/faq-xiyoulinux") 43 | row.getString(2) should be("refs/heads/HEAD") 44 | row.getString(3) should be("LICENSE") 45 | row.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 46 | case (3, row) => 47 | row.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 48 | row.getString(1) should be("github.com/mawag/faq-xiyoulinux") 49 | row.getString(2) should be("refs/heads/HEAD") 50 | row.getString(3) should be("README.md") 51 | row.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 52 | case _ => 53 | }, total = 23189, columnsCount = cols.length 54 | ) 55 | } 56 | 57 | it should "filter by path" in { 58 | val filters = Seq(EqualFilter( 59 | Attr("path", "tree_entries"), 60 | "README.md") 61 | ) 62 | 63 | testIterator(repo => 64 | new GitTreeEntryIterator( 65 | cols, 66 | repo, 67 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 68 | filters, 69 | false 70 | ), { 71 | case (_, r) => 72 | r.getString(3) should be("README.md") 73 | }, total = 1062, columnsCount = cols.length 74 | ) 75 | } 76 | 77 | it should "filter by blob" in { 78 | val filters = Seq(EqualFilter( 79 | Attr("blob", "tree_entries"), 80 | "733c072369ca77331f392c40da7404c85c36542c") 81 | ) 82 | 83 | testIterator(repo => 84 | new GitTreeEntryIterator( 85 | cols, 86 | repo, 87 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 88 | filters, 89 | false 90 | ), { 91 | case (_, r) => 92 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 93 | }, total = 1062, columnsCount = cols.length 94 | ) 95 | } 96 | 97 | it should "work when it's chained" in { 98 | val filters = Seq(EqualFilter( 99 | Attr("hash", "commits"), 
100 | "fff7062de8474d10a67d417ccea87ba6f58ca81d"), 101 | allCommitsFilter 102 | ) 103 | 104 | testIterator(repo => 105 | new GitTreeEntryIterator( 106 | cols, 107 | repo, 108 | new CommitIterator(Array("hash"), repo, null, filters, false), 109 | Seq(), 110 | false 111 | ), { 112 | case (i, r) if i % 2 == 0 => 113 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 114 | r.getString(3) should be("LICENSE") 115 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 116 | 117 | case (_, r) => 118 | r.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 119 | r.getString(3) should be("README.md") 120 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 121 | }, total = 86, columnsCount = cols.length 122 | ) 123 | } 124 | 125 | it should "filter by commit hash" in { 126 | val filters = Seq(EqualFilter( 127 | Attr("commit_hash", "tree_entries"), 128 | "fff7062de8474d10a67d417ccea87ba6f58ca81d") 129 | ) 130 | 131 | testIterator(repo => 132 | new GitTreeEntryIterator( 133 | cols, 134 | repo, 135 | new CommitIterator(cols, repo, null, Seq(allCommitsFilter), false), 136 | filters, 137 | false 138 | ), { 139 | case (i, r) if i % 2 == 0 => 140 | r.getString(4) should be("733c072369ca77331f392c40da7404c85c36542c") 141 | r.getString(3) should be("LICENSE") 142 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 143 | 144 | case (_, r) => 145 | r.getString(4) should be("2d2ad68c14c51e62595125b86b464427f6bf2126") 146 | r.getString(3) should be("README.md") 147 | r.getString(0) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 148 | }, total = 86, columnsCount = cols.length 149 | ) 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/MetadataIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.nio.file.Paths 4 | import java.util.{Properties, UUID} 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} 9 | import org.apache.spark.sql.types.{Metadata, StringType, StructType} 10 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 11 | import tech.sourced.engine.{BaseSparkSpec, Schema} 12 | 13 | class JDBCQueryIteratorSpec 14 | extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec { 15 | private val tmpPath = Paths.get( 16 | System.getProperty("java.io.tmpdir"), 17 | UUID.randomUUID.toString 18 | ) 19 | 20 | private val dbPath = tmpPath.resolve("test.db") 21 | 22 | override def beforeAll(): Unit = { 23 | super.beforeAll() 24 | tmpPath.toFile.mkdir() 25 | val rdd = ss.sparkContext.parallelize(Seq( 26 | Row("id1"), 27 | Row("id2"), 28 | Row("id3") 29 | )) 30 | 31 | val properties = new Properties() 32 | properties.put("driver", "org.sqlite.JDBC") 33 | val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head))) 34 | df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties) 35 | } 36 | 37 | override def afterAll(): Unit = { 38 | super.afterAll() 39 | FileUtils.deleteQuietly(tmpPath.toFile) 40 | } 41 | 42 | "JDBCQueryIterator" should "return all rows for the query" in { 43 | val iter = new JDBCQueryIterator( 44 | Seq(attr("id")), 45 | dbPath.toString, 46 | "SELECT id FROM repositories ORDER BY id" 47 | ) 48 | 49 | // calling hasNext more than one 
time does not cause rows to be lost 50 | iter.hasNext 51 | iter.hasNext 52 | val rows = (for (row <- iter) yield row).toArray 53 | rows.length should be(3) 54 | rows(0).length should be(1) 55 | rows(0)(0).toString should be("id1") 56 | rows(1)(0).toString should be("id2") 57 | rows(2)(0).toString should be("id3") 58 | } 59 | 60 | private def attr(name: String): Attribute = AttributeReference( 61 | name, StringType, nullable = false, Metadata.empty 62 | )() 63 | } 64 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/ReferenceIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import org.scalatest.FlatSpec 4 | import tech.sourced.engine.util.{Attr, EqualFilter} 5 | 6 | class ReferenceIteratorSpec extends FlatSpec with BaseChainableIterator { 7 | 8 | "ReferenceIterator" should "return all references from all repositories into a siva file" in { 9 | testIterator( 10 | new ReferenceIterator(Array("repository_id", "name", "hash"), _, null, Seq(), false), { 11 | case (0, row) => 12 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 13 | row.getString(1) should be("refs/heads/HEAD") 14 | row.getString(2) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 15 | case (1, row) => 16 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 17 | row.getString(1) should be("refs/heads/HEAD") 18 | row.getString(2) should be("fff7062de8474d10a67d417ccea87ba6f58ca81d") 19 | case (2, row) => 20 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 21 | row.getString(1) should be("refs/heads/develop") 22 | row.getString(2) should be("880653c14945dbbc915f1145561ed3df3ebaf168") 23 | case _ => 24 | }, total = 43, columnsCount = 3 25 | ) 26 | } 27 | 28 | it should "return only specified columns" in { 29 | testIterator( 30 | new ReferenceIterator(Array("repository_id", "name"), _, null, Seq(), false), { 31 | case (0, row) => 32 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 33 | row.getString(1) should be("refs/heads/HEAD") 34 | case (1, row) => 35 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 36 | row.getString(1) should be("refs/heads/HEAD") 37 | case (2, row) => 38 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 39 | row.getString(1) should be("refs/heads/develop") 40 | case _ => 41 | }, total = 43, columnsCount = 2 42 | ) 43 | } 44 | 45 | it should "apply passed filters" in { 46 | testIterator( 47 | new ReferenceIterator( 48 | Array("repository_id", "name"), 49 | _, 50 | null, 51 | Seq(EqualFilter(Attr("name", "references"), "refs/heads/develop")), 52 | false 53 | ), { 54 | case (0, row) => 55 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 56 | row.getString(1) should be("refs/heads/develop") 57 | case (1, row) => 58 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 59 | row.getString(1) should be("refs/heads/develop") 60 | }, total = 2, columnsCount = 2 61 | ) 62 | } 63 | 64 | it should "use previously passed iterator" in { 65 | testIterator(repo => 66 | new ReferenceIterator( 67 | Array("repository_id", "name"), 68 | repo, 69 | new RepositoryIterator( 70 | "/foo/bar", 71 | Array("id"), 72 | repo, 73 | Seq(EqualFilter(Attr("id", "repository"), "github.com/xiyou-linuxer/faq-xiyoulinux")), 74 | false 75 | ), 76 | Seq(EqualFilter(Attr("name", "references"), "refs/heads/develop")), 77 | 
false 78 | ), { 79 | case (0, row) => 80 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 81 | row.getString(1) should be("refs/heads/develop") 82 | }, total = 1, columnsCount = 2 83 | ) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/iterator/RepositoryIteratorSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.iterator 2 | 3 | import java.nio.file.Paths 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.scalatest.{BeforeAndAfterEach, FlatSpec} 8 | import tech.sourced.engine.provider.{RepositoryProvider, RepositoryRDDProvider} 9 | import tech.sourced.engine.util.{Attr, EqualFilter} 10 | 11 | class RepositoryIteratorSpec extends FlatSpec with BaseChainableIterator with BeforeAndAfterEach { 12 | 13 | private var tmpDir: java.nio.file.Path = _ 14 | 15 | override def beforeEach(): Unit = { 16 | super.beforeEach() 17 | tmpDir = Paths.get(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString) 18 | tmpDir.toFile.mkdir() 19 | } 20 | 21 | override def afterEach(): Unit = { 22 | super.afterEach() 23 | FileUtils.deleteQuietly(tmpDir.toFile) 24 | } 25 | 26 | "RepositoryIterator" should "return data for all repositories into a siva file" in { 27 | testIterator( 28 | new RepositoryIterator( 29 | "/foo/bar", 30 | Array("id", "urls", "is_fork", "repository_path"), 31 | _, 32 | Seq(), 33 | false 34 | ), { 35 | case (0, row) => 36 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 37 | row.getAs[Array[String]](1).length should be(3) 38 | row.getBoolean(2) should be(false) 39 | row.getString(3) should be("/foo/bar") 40 | case (1, row) => 41 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 42 | row.getAs[Array[String]](1).length should be(3) 43 | row.getBoolean(2) should be(true) 44 | row.getString(3) should be("/foo/bar") 45 | case (c, _) => fail(s"unexpected row number: $c") 46 | }, total = 2, columnsCount = 4 47 | ) 48 | } 49 | 50 | it should "return only specified columns" in { 51 | testIterator( 52 | new RepositoryIterator("/foo/bar", Array("id", "is_fork"), _, Seq(), false), { 53 | case (0, row) => 54 | row.getString(0) should be("github.com/xiyou-linuxer/faq-xiyoulinux") 55 | row.getBoolean(1) should be(false) 56 | case (1, row) => 57 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 58 | row.getBoolean(1) should be(true) 59 | case (c, _) => fail(s"unexpected row number: $c") 60 | }, total = 2, columnsCount = 2 61 | ) 62 | } 63 | 64 | it should "apply passed filters" in { 65 | testIterator( 66 | new RepositoryIterator( 67 | "/foo/bar", 68 | Array("id", "is_fork"), 69 | _, 70 | Seq(EqualFilter(Attr("id", "repository"), "github.com/mawag/faq-xiyoulinux")), 71 | false 72 | ), { 73 | case (0, row) => 74 | row.getString(0) should be("github.com/mawag/faq-xiyoulinux") 75 | row.getBoolean(1) should be(true) 76 | case (c, _) => fail(s"unexpected row number: $c") 77 | }, total = 1, columnsCount = 2 78 | ) 79 | } 80 | 81 | it should "return a repository for each distinct remote and the local dir" in { 82 | import tech.sourced.engine.util.RepoUtils._ 83 | 84 | val gitRepo = createRepo(tmpDir.resolve("repo")) 85 | 86 | addRemote(gitRepo, "repo", "git@github.com:git/repo.git") 87 | 88 | FileUtils.write(tmpDir.resolve("repo").resolve("README.md").toFile, "hello world") 89 | gitRepo.add().addFilepattern("README.md").call() 90 | 
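    // Commit the staged README so the directory is a usable repository; the iterator is then
    // expected to report one row per distinct remote (github.com/git/repo) plus the local
    // file:// directory, matching the assertions further down.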
gitRepo.commit().setMessage("first commit on regular repo").call() 91 | 92 | val rdd = RepositoryRDDProvider(ss.sparkContext) 93 | .get(tmpDir.toString, RepositoryRDDProvider.StandardFormat) 94 | val source = rdd.first() 95 | val provider = RepositoryProvider(tmpDir.toString) 96 | val repo = provider.get(source) 97 | 98 | val iter = new RepositoryIterator("/foo/bar", Array("id"), repo, Seq(), false) 99 | val repos = iter.toList 100 | 101 | repos.length should be(2) 102 | repos.head(0).toString should be("github.com/git/repo") 103 | repos(1)(0).toString should startWith("file://") 104 | 105 | provider.close(source, repo) 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/provider/RepositoryRDDProviderSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.provider 2 | 3 | import java.nio.file.{Path, Paths} 4 | import java.util.UUID 5 | 6 | import org.apache.commons.io.FileUtils 7 | import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} 8 | import tech.sourced.engine.util.RepoUtils 9 | import tech.sourced.engine.{BaseSivaSpec, BaseSparkSpec} 10 | 11 | class RepositoryRDDProviderSpec extends FlatSpec with Matchers with BeforeAndAfterEach 12 | with BaseSparkSpec with BaseSivaSpec { 13 | 14 | private var provider: RepositoryRDDProvider = _ 15 | private var tmpPath: Path = _ 16 | 17 | override def beforeEach(): Unit = { 18 | super.beforeEach() 19 | provider = RepositoryRDDProvider(ss.sparkContext) 20 | tmpPath = Paths.get( 21 | System.getProperty("java.io.tmpdir"), 22 | UUID.randomUUID().toString 23 | ) 24 | } 25 | 26 | override def afterEach(): Unit = { 27 | super.afterEach() 28 | 29 | FileUtils.deleteQuietly(tmpPath.toFile) 30 | } 31 | 32 | "RepositoryRDDProvider" should "retrieve bucketized raw repositories" in { 33 | tmpPath.resolve("a").toFile.mkdir() 34 | createRepo(tmpPath.resolve("a").resolve("repo")) 35 | 36 | tmpPath.resolve("b").toFile.mkdir() 37 | createRepo(tmpPath.resolve("b").resolve("repo")) 38 | 39 | createRepo(tmpPath.resolve("repo")) 40 | 41 | val repos = provider.get(tmpPath.toString, "standard").collect() 42 | repos.length should be(3) 43 | } 44 | 45 | it should "retrieve non-bucketized raw repositories" in { 46 | tmpPath.resolve("a").toFile.mkdir() 47 | createRepo(tmpPath.resolve("repo")) 48 | 49 | tmpPath.resolve("b").toFile.mkdir() 50 | createRepo(tmpPath.resolve("repo2")) 51 | 52 | val repos = provider.get(tmpPath.toString, "standard").collect() 53 | repos.length should be(2) 54 | } 55 | 56 | it should "retrieve bucketized siva repositories" in { 57 | val repos = provider.get(resourcePath, "siva").collect() 58 | repos.length should be(3) 59 | } 60 | 61 | it should "retrieve non-bucketized siva repositories" in { 62 | val repos = provider.get(Paths.get(resourcePath, "ff").toString, "siva").collect() 63 | repos.length should be(1) 64 | } 65 | 66 | private def createRepo(path: Path) = { 67 | val repo = RepoUtils.createRepo(path) 68 | RepoUtils.commitFile(repo, "file.txt", "something something", "some commit") 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/util/FilterSpec.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import org.apache.spark.sql.catalyst.expressions._ 4 | import org.apache.spark.sql.types.StringType 5 | import 
org.scalatest.{FlatSpec, Matchers} 6 | 7 | class FilterSpec extends FlatSpec with Matchers { 8 | "CompiledFilters" should "filter properly depending of his type" in { 9 | val eq = EqualFilter(Attr("test", ""), "a") 10 | 11 | eq.eval("a") should be(true) 12 | eq.eval("b") should be(false) 13 | 14 | val notEq = NotFilter(EqualFilter(Attr("test", ""), "a")) 15 | 16 | notEq.eval("a") should be(false) 17 | notEq.eval("b") should be(true) 18 | 19 | val in = InFilter(Attr("test", ""), Array("a", "b", "c")) 20 | 21 | in.eval("a") should be(true) 22 | in.eval("b") should be(true) 23 | in.eval("c") should be(true) 24 | in.eval("d") should be(false) 25 | 26 | val gt = GreaterThanFilter(Attr("test", ""), 5) 27 | 28 | gt.eval(4) should be(false) 29 | gt.eval(5) should be(false) 30 | gt.eval(6) should be(true) 31 | 32 | val gte = GreaterThanOrEqualFilter(Attr("test", ""), 5) 33 | 34 | gte.eval(4) should be(false) 35 | gte.eval(5) should be(true) 36 | gte.eval(6) should be(true) 37 | 38 | val lt = LessThanFilter(Attr("test", ""), 5) 39 | 40 | lt.eval(4) should be(true) 41 | lt.eval(5) should be(false) 42 | lt.eval(6) should be(false) 43 | 44 | val lte = LessThanOrEqualFilter(Attr("test", ""), 5) 45 | 46 | lte.eval(4) should be(true) 47 | lte.eval(5) should be(true) 48 | lte.eval(6) should be(false) 49 | } 50 | 51 | "ColumnFilter" should "process correctly columns" in { 52 | // test = 'val' AND test IS NOT NULL AND test2 = 'val2' AND test3 IN ('a', 'b') 53 | val f = Filter.compile(And( 54 | And( 55 | And( 56 | EqualTo(AttributeReference("test", StringType)(), Literal("val")), 57 | IsNotNull(AttributeReference("test", StringType)()) 58 | ), 59 | EqualTo(AttributeReference("test2", StringType)(), Literal("val2")) 60 | ), 61 | In(AttributeReference("test3", StringType)(), Seq(Literal("a"), Literal("b"))) 62 | )) 63 | 64 | f.length should be(4) 65 | val filters = Filters(f) 66 | filters.matches(Seq("test"), "val") should be(true) 67 | filters.matches(Seq("test2"), "val") should be(false) 68 | filters.matches(Seq("test3"), "b") should be(true) 69 | } 70 | 71 | "ColumnFilter" should "handle correctly unsupported filters" in { 72 | val f = Filter.compile(StartsWith(AttributeReference("test", StringType)(), Literal("a"))) 73 | 74 | f.length should be(0) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/tech/sourced/engine/util/RepoUtils.scala: -------------------------------------------------------------------------------- 1 | package tech.sourced.engine.util 2 | 3 | import java.nio.file.{Path, Paths} 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.eclipse.jgit.api.CreateBranchCommand.SetupUpstreamMode 7 | import org.eclipse.jgit.api.Git 8 | import org.eclipse.jgit.revwalk.RevCommit 9 | import org.eclipse.jgit.transport.URIish 10 | 11 | object RepoUtils { 12 | 13 | def createBareRepo(path: Path): Git = { 14 | Git.init().setBare(true).setDirectory(path.toFile).call() 15 | } 16 | 17 | def createRepo(path: Path): Git = { 18 | Git.init().setDirectory(path.toFile).call() 19 | } 20 | 21 | def addRemote(repo: Git, name: String, url: String): Unit = { 22 | val cmd = repo.remoteAdd() 23 | cmd.setName(name) 24 | cmd.setUri(new URIish(url)) 25 | cmd.call() 26 | } 27 | 28 | def commitFile(repo: Git, name: String, content: String, msg: String): RevCommit = { 29 | val file = Paths.get(repo.getRepository.getDirectory.getParent, name) 30 | FileUtils.write(file.toFile, content) 31 | repo.add().addFilepattern(name).call() 32 | 
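    // With the new file staged, create the commit; the resulting RevCommit is returned to callers.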
repo.commit().setMessage(msg).call() 33 | } 34 | 35 | } 36 | --------------------------------------------------------------------------------
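Taken together, the specs above trace the query chain end to end. As a condensed, hypothetical sketch built only from calls that appear in these tests (Engine, registerUDFs, getRepositories, getHEAD, getCommits, getTreeEntries, getBlobs, classifyLanguages), with "/path/to/siva-files" standing in as a placeholder path:

```scala
import org.apache.spark.sql.SparkSession
import tech.sourced.engine._

object ChainExample extends App {
  val spark = SparkSession.builder()
    .appName("jgit-spark-connector-example")
    .master("local[*]")
    .getOrCreate()

  // Register the engine UDFs on the session, as BaseSparkSpec does in beforeAll.
  spark.registerUDFs()

  // "/path/to/siva-files" is a placeholder for a directory of siva-packed repositories.
  val engine = Engine(spark, "/path/to/siva-files", "siva")

  // Blobs reachable from each repository's HEAD, tagged with a detected language,
  // mirroring the "get all files from HEADS that are Ruby" case in BaseSourceSpec.
  val blobs = engine.getRepositories
    .getHEAD
    .getCommits
    .getTreeEntries
    .getBlobs
    .classifyLanguages

  blobs.filter(blobs("lang") === "Ruby")
    .select("repository_id", "path", "lang")
    .show()

  spark.stop()
}
```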