├── .dockerignore ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── codeql-analysis.yml │ ├── dev-deploy.yaml │ ├── docker-build.yml │ └── release-deploy.yaml ├── .gitignore ├── .gitmodules ├── .gitpod.Dockerfile ├── .gitpod.yml ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── Release-Checklist.md ├── bin ├── dockler.sh ├── kickstart.sh ├── sce.sh └── sparkler.sh ├── build.sbt ├── conf ├── domain-suffixes.xml ├── felix-config.properties ├── log4j.properties ├── log4j2.properties ├── regex-urlfilter.txt ├── solr-schema-map.yaml ├── solr │ ├── crawldb │ │ ├── conf │ │ │ ├── _rest_managed.json │ │ │ ├── currency.xml │ │ │ ├── enumsConfig.xml │ │ │ ├── lang │ │ │ │ └── stopwords_en.txt │ │ │ ├── managed-schema │ │ │ ├── protwords.txt │ │ │ ├── solrconfig.xml │ │ │ ├── stopwords.txt │ │ │ └── synonyms.txt │ │ └── core.properties │ ├── solr.xml │ └── sparkler-jetty-context.xml ├── sparkler-default.yaml └── user-agents.txt ├── docs ├── .gitignore ├── README.md ├── Sparkler-Dashboard.png ├── _config.yml ├── _includes │ ├── disqus.html │ ├── footer.html │ ├── google_analytics.html │ ├── header.html │ └── navigation.html ├── _layouts │ ├── default.html │ └── page.html ├── _posts │ ├── .gitkeep │ ├── 2017-12-26-contributing-to-docs.md │ └── 2017-12-26-development-environment-setup.md ├── bin │ └── jekyll-page ├── changelog.md ├── css │ ├── main.css │ └── syntax.css ├── index.md ├── presentations │ └── Sparkler-for-SparkSummitEast17.pdf └── proposal │ ├── Diagram.xml │ └── Sparkler-Flow.svg ├── plugins.build.sbt ├── project ├── Dependencies.scala ├── PluginDependencies.scala ├── Settings.scala ├── build.properties ├── metals.sbt ├── plugins.sbt └── project │ └── metals.sbt ├── release.sh ├── retired ├── sparkler-sce │ ├── LICENSE │ ├── README.md │ ├── compose │ │ └── docker-compose.yaml │ ├── deployment.yaml │ ├── deployment │ │ └── Dockerfile │ ├── evaluation │ │ └── phase1 │ │ │ ├── readme.md │ │ │ └── 
relevancy-model.txt │ └── webui │ │ ├── Dockerfile │ │ ├── app │ │ ├── __init__.py │ │ ├── apis │ │ │ ├── __init__.py │ │ │ ├── ns_classify.py │ │ │ └── ns_search.py │ │ ├── classifier │ │ │ └── __init__.py │ │ ├── controller.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── model.py │ │ └── search │ │ │ ├── __init__.py │ │ │ └── fetcher.py │ │ ├── config.py │ │ ├── keywords.txt │ │ ├── requirements.txt │ │ ├── run.py │ │ ├── run.wsgi │ │ ├── wait-for-it.sh │ │ └── waitress_server.py └── sparkler-ui │ ├── Dockerfile │ ├── README.md │ ├── database.yaml │ ├── deployment.yaml │ ├── package-lock.json │ ├── package.json │ ├── public │ ├── favicon.ico │ ├── index.html │ └── manifest.json │ ├── scripts │ ├── 000-default.conf │ └── run.sh │ └── src │ ├── App.css │ ├── App.js │ ├── App.test.js │ ├── actions │ ├── index.js │ └── test.js │ ├── components │ ├── Banana.js │ ├── IFrameFrame.js │ ├── Navbar.js │ ├── Previews.js │ ├── RouterButton.js │ ├── Solr.js │ ├── VisitDashboard.js │ ├── dialogs │ │ ├── CrawlConfigDialog.js │ │ ├── ModelDialog.js │ │ ├── SeedURLDialog.js │ │ └── StartCrawlDialog.js │ ├── panels │ │ ├── ListModels.js │ │ └── NewModel.js │ └── sidebar │ │ ├── CrawlConfig.js │ │ ├── CreateSeed.js │ │ ├── ExportData.js │ │ ├── ExportModel.js │ │ ├── GenerateModel.js │ │ ├── StartCrawl.js │ │ └── StartCrawl_old.js │ ├── index.css │ ├── index.js │ ├── list.csv │ ├── logo.svg │ ├── reducers │ ├── index.js │ ├── model.js │ ├── search.js │ ├── test.js │ └── types.js │ ├── serviceWorker.js │ ├── utils │ └── constants.js │ └── views │ ├── BananaFrame.js │ ├── Home.js │ ├── Main.js │ └── SolrFrame.js ├── scalastyle-config.xml ├── sparkler-api └── src │ ├── main │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ ├── AbstractExtensionPoint.java │ │ ├── Config.java │ │ ├── ConfigKey.java │ │ ├── Constants.java │ │ ├── ExtensionChain.java │ │ ├── ExtensionPoint.java │ │ ├── Fetcher.java │ │ ├── GenericProcess.java │ │ ├── JobContext.java │ │ ├── 
Scorer.java │ │ ├── SparklerConfiguration.java │ │ ├── SparklerException.java │ │ ├── URLFilter.java │ │ ├── URLNormalizer.java │ │ ├── UrlInjectorObj.java │ │ ├── model │ │ ├── FetchedData.java │ │ ├── MultiMap.java │ │ ├── Resource.java │ │ └── ResourceStatus.java │ │ ├── storage │ │ ├── FieldMapper.java │ │ └── StringEvaluator.java │ │ └── util │ │ ├── CustomHttpRequestExecutor.java │ │ ├── DomainSuffix.java │ │ ├── DomainSuffixes.java │ │ ├── DomainSuffixesReader.java │ │ ├── FetcherDefault.java │ │ ├── StreamTransformer.java │ │ ├── StringUtil.java │ │ ├── TestUtils.java │ │ ├── TopLevelDomain.java │ │ └── URLUtil.java │ └── test │ ├── java │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── util │ │ └── FetcherDefaultTest.java │ └── resources │ ├── domain-suffixes.xml │ ├── sparkler-default.yaml │ └── user-agents.txt ├── sparkler-app └── src │ ├── assembly │ └── dep.xml │ ├── main │ ├── resources │ │ ├── domain-suffixes.xml │ │ ├── log4j2.properties │ │ ├── regex-urlfilter.txt │ │ ├── solr-schema-map.yaml │ │ ├── sparkler-default.yaml │ │ └── user-agents.txt │ └── scala │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ ├── Main.scala │ │ ├── base │ │ ├── CliTool.scala │ │ ├── Loggable.scala │ │ └── SparklerSink.scala │ │ ├── model │ │ ├── CrawlData.scala │ │ ├── ParsedData.scala │ │ └── SparklerJob.scala │ │ ├── pipeline │ │ ├── Crawler.scala │ │ ├── CrawlerRunner.scala │ │ ├── FairFetcher.scala │ │ ├── FetchFunction.scala │ │ ├── GenericFunction.scala │ │ ├── OutLinkFilterFunction.scala │ │ ├── ParseFunction.scala │ │ ├── RunCrawl.scala │ │ ├── ScoreFunction.scala │ │ ├── SparklerProducer.scala │ │ └── UrlInjectorFunction.scala │ │ ├── service │ │ ├── Dumper.scala │ │ ├── Injector.scala │ │ ├── PluginService.scala │ │ └── RejectingURLFilterChain.scala │ │ ├── storage │ │ ├── ScoreUpdateTransformer.scala │ │ ├── SparklerGroupPartition.scala │ │ ├── StatusUpdate.scala │ │ ├── StatusUpdateTransformer.scala │ │ ├── StorageProxy.scala │ │ ├── 
StorageProxyFactory.scala │ │ ├── StorageRDD.scala │ │ ├── Upserter.scala │ │ ├── elasticsearch │ │ │ ├── ElasticsearchDeepRDD.scala │ │ │ ├── ElasticsearchProxy.scala │ │ │ ├── ElasticsearchRDD.scala │ │ │ ├── ElasticsearchResultIterator.scala │ │ │ ├── ElasticsearchUpsert.scala │ │ │ ├── ScoreUpdateElasticsearchTransformer.scala │ │ │ └── StatusUpdateElasticsearchTransformer.scala │ │ └── solr │ │ │ ├── ContentHash.scala │ │ │ ├── ScoreUpdateSolrTransformer.scala │ │ │ ├── SolrDeepRDD.scala │ │ │ ├── SolrProxy.scala │ │ │ ├── SolrRDD.scala │ │ │ ├── SolrResultIterator.scala │ │ │ ├── SolrUpsert.scala │ │ │ └── StatusUpdateSolrTransformer.scala │ │ └── util │ │ ├── FileDumperTool.scala │ │ ├── HealthChecks.scala │ │ ├── JobUtil.scala │ │ └── NutchBridge.scala │ └── test │ └── resources │ └── sparkler-default.yaml ├── sparkler-deployment ├── docker-k8s │ └── Dockerfile ├── docker │ ├── Dockerfile │ ├── Dockerfile.solr │ ├── conf │ │ └── sparkler-default.yaml │ ├── elasticsearch │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── docker-compose.yml │ │ ├── dockler.py │ │ └── scripts │ │ │ ├── greeting.sh │ │ │ └── start.sh │ ├── jetty-csp-patch │ │ └── jetty.xml │ └── runsparkler.sh ├── helm │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ └── deployment.yaml │ ├── test.yaml │ └── values.yaml ├── juju │ ├── bundles │ │ ├── sparkler-basic-spark │ │ │ ├── README.md │ │ │ └── bundle.yaml │ │ └── sparkler-basic │ │ │ ├── README.md │ │ │ └── bundle.yaml │ └── sparkler │ │ ├── README.md │ │ ├── actions.yaml │ │ ├── actions │ │ ├── addseedurls │ │ ├── crawl │ │ ├── inject │ │ ├── removeallseedurls │ │ └── removeseedurls │ │ ├── config.yaml │ │ ├── icon.svg │ │ ├── layer.yaml │ │ ├── metadata.yaml │ │ ├── reactive │ │ └── sparkler.py │ │ ├── resources.yaml │ │ └── wheelhouse.txt ├── sparkler-init │ └── Dockerfile └── sparkler-k8s │ └── deployment.yaml ├── sparkler-plugins ├── databricks-api-plugin │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── 
kytheralabs │ │ └── databricks │ │ ├── DatabricksAPI.java │ │ ├── DatabricksAPIActivator.java │ │ └── Persistence.java ├── fetcher-chrome │ └── src │ │ └── main │ │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── plugin │ │ ├── FetcherChrome.java │ │ ├── FetcherChromeActivator.java │ │ └── ProxySelector.java ├── fetcher-htmlunit │ ├── README.md │ └── src │ │ ├── main │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ ├── HtmlUnitFetcher.java │ │ │ └── HtmlUnitFetcherActivator.java │ │ └── test │ │ ├── java │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ └── HtmlUnitFetcherTest.java │ │ └── resources │ │ └── log4j.properties ├── fetcher-jbrowser │ └── src │ │ ├── main │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ ├── FetcherJBrowser.java │ │ │ └── FetcherJBrowserActivator.java │ │ └── test │ │ ├── java │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ └── FetcherJBrowserTest.java │ │ └── resources │ │ └── log4j.properties ├── scorer-dd-svn │ └── src │ │ ├── main │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ ├── DdSvnScorer.java │ │ │ ├── DdSvnScorerActivator.java │ │ │ └── ddsvn │ │ │ └── ApacheHttpRestClient.java │ │ └── test │ │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── plugin │ │ └── DdSvnScorerTest.java ├── template-plugin │ └── src │ │ ├── main │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ ├── MyPlugin.java │ │ │ └── MyPluginActivator.java │ │ └── test │ │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── plugin │ │ └── MyPluginTest.java ├── url-injector │ └── src │ │ └── main │ │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── plugin │ │ ├── UrlInjector.java │ │ └── UrlInjectorActivator.java ├── 
urlfilter-regex │ └── src │ │ ├── main │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ ├── RegexURLFilter.java │ │ │ ├── RegexURLFilterActivator.java │ │ │ └── regex │ │ │ ├── RegexRule.java │ │ │ └── RegexURLFilterBase.java │ │ └── test │ │ ├── java │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ └── sparkler │ │ │ └── plugin │ │ │ └── RegexURLFilterTest.java │ │ └── resources │ │ └── regex-urlfilter.txt └── urlfilter-samehost │ └── src │ ├── main │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── plugin │ │ ├── UrlFilterSameHost.java │ │ └── UrlFilterSameHostActivator.java │ └── test │ └── java │ └── edu │ └── usc │ └── irds │ └── sparkler │ └── plugin │ └── UrlFilterSameHostTest.java ├── sparkler-tests-base └── src │ └── main │ ├── java │ └── edu │ │ └── usc │ │ └── irds │ │ └── sparkler │ │ └── test │ │ ├── TestSlaveServlet.java │ │ ├── WebServer.java │ │ └── WebServerRunListener.java │ └── resources │ └── webapp │ ├── index.html │ └── jspage.html └── version.sbt /.dockerignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | # 18 | 19 | # ignore everything 20 | * 21 | 22 | # except these 23 | !build 24 | !sparkler-ui/ 25 | !conf 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | #### Issue Description 2 | 3 | Please describe your issue, along with: 4 | - expected behavior 5 | - encountered behavior 6 | 7 | #### How to reproduce it 8 | If you are describing a bug, please describe here how to reproduce it. 9 | 10 | #### Environment and Version Information 11 | 12 | Please indicate relevant versions, including, if relevant: 13 | 14 | * Java Version 15 | * Spark Version 16 | * Operating System name and version 17 | 18 | 19 | #### Any external links for reference 20 | 21 | If you think any other resources on the internet will be helpful to understand and/or resolve this issue, please share them here. 22 | 23 | #### Contributing 24 | 25 | If you'd like to help us fix the issue by contributing some code, but would like guidance or help in doing so, please mention it! 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What changes were proposed in this pull request? 2 | 3 | (Please fill in changes proposed in this fix) 4 | 5 | **Is this related to an already existing issue on sparkler?** 6 | If so, mention that issue by referencing its number here. 7 | 8 | **Will it close an existing issue?** 9 | Say 'Closes #IssueNum' here. 10 | 11 | 12 | ### How was this patch tested? 13 | 14 | We are particularly interested in unit tests, integration tests, manual tests you did to ensure that the patch works as expected, so briefly describe them. 15 | 16 | 17 | Please review 18 | https://github.com/USCDataScience/sparkler/blob/master/.github/CONTRIBUTING.md before opening a pull request. 
19 | -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | env: 4 | REGISTRY: ghcr.io 5 | IMAGE_NAME: ${{ github.repository }}/sparkler 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | 15 | build: 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Log in to the Container registry 22 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 23 | with: 24 | registry: ${{ env.REGISTRY }} 25 | username: ${{ github.actor }} 26 | password: ${{ secrets.GITHUB_TOKEN }} 27 | - name: Extract metadata (tags, labels) for Docker 28 | id: meta 29 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 30 | with: 31 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 32 | - name: Build and push Docker image 33 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 34 | with: 35 | context: sparkler-core 36 | push: true 37 | tags: ${{ steps.meta.outputs.tags }} 38 | labels: ${{ steps.meta.outputs.labels }} 39 | file: sparkler-deployment/docker/Dockerfile 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Custom 17 | .bloop 18 | **/scalastyle-config 19 | **/.scalafmt.conf 20 | **/.metals 21 | **/.bsp 22 | 23 | # Standard Java 24 | *.class 25 | *.log 26 | *.jar 27 | *.war 28 | core/build/** 29 | 30 | # sbt specific 31 | .cache 32 | .history 33 | .lib/ 34 | dist/* 35 | target/ 36 | lib_managed/ 37 | src_managed/ 38 | project/boot/ 39 | project/plugins/project/ 40 | 41 | # Scala-IDE specific 42 | .scala_dependencies 43 | .worksheet 44 | **~ 45 | sparkler-job-** 46 | .idea/ 47 | *.iml 48 | 49 | # Eclipse-IDE specific 50 | .classpath 51 | .project 52 | .cache-main 53 | .settings/ 54 | 55 | . 
auto generated files 56 | sjob-** 57 | 58 | # MAC Files 59 | .DS_Store 60 | 61 | # Application Files 62 | /resources 63 | felix-cache/ 64 | 65 | tmp* 66 | workspace* 67 | sparkler-core/build/* 68 | sparkler-ui/node_modules/* 69 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/.gitmodules -------------------------------------------------------------------------------- /.gitpod.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.gitlab.com/spiculedata/custom-gitpod-full:latest 2 | 3 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.0-linux-x86_64.tar.gz && \ 4 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.0-linux-x86_64.tar.gz.sha512 && \ 5 | shasum -a 512 -c elasticsearch-7.17.0-linux-x86_64.tar.gz.sha512 && \ 6 | tar -xzf elasticsearch-7.17.0-linux-x86_64.tar.gz 7 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | image: 2 | file: .gitpod.Dockerfile 3 | 4 | tasks: 5 | - init: sbt assembly package 6 | command: cd /home/gitpod/elasticsearch-7.17.0/ && ./bin/elasticsearch -d -p pid 7 | ports: 8 | - port: 8080 9 | visibility: public 10 | onOpen: open-browser 11 | - port: 9200 12 | visibility: public 13 | onOpen: open-browser 14 | github: 15 | prebuilds: 16 | # enable for the default branch (defaults to true) 17 | master: true 18 | # enable for all branches in this repo (defaults to false) 19 | branches: true 20 | # enable for pull requests coming from this repo (defaults to true) 21 | pullRequests: true 22 | # enable for pull requests coming from forks (defaults to false) 23 | 
pullRequestsFromForks: false 24 | # add a check to pull requests (defaults to true) 25 | addCheck: true 26 | # add a "Review in Gitpod" button as a comment to pull requests (defaults to false) 27 | addComment: true 28 | # add a "Review in Gitpod" button to the pull request's description (defaults to false) 29 | addBadge: true 30 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.watcherExclude": { 3 | "**/target": true 4 | } 5 | } -------------------------------------------------------------------------------- /Release-Checklist.md: -------------------------------------------------------------------------------- 1 | 2 | # Release checklist 3 | 4 | Contributors: Thamme Gowda 5 | 6 | 7 | ## Update Versions 8 | 9 | Update the version to $VERSION=x.y.z 10 | 11 | mvn versions:set -DnewVersion=$VERSION 12 | mvn versions:commit 13 | 14 | ## Build the project 15 | 16 | mvn clean package 17 | 18 | 19 | ## Build docker image 20 | 21 | docker build -f sparkler-deployment/docker/Dockerfile . 
-t sparkler-local 22 | 23 | 24 | ### Push docker image to Docker Hub 25 | 26 | docker login 27 | # the account should have push permission to repo https://hub.docker.com/r/uscdatascience/sparkler 28 | 29 | docker tag sparkler-local uscdatascience/sparkler:$VERSION 30 | docker push uscdatascience/sparkler:$VERSION 31 | 32 | # Also tag it as latest 33 | docker tag sparkler-local uscdatascience/sparkler:latest 34 | docker push uscdatascience/sparkler:latest 35 | 36 | 37 | ## Release Jars to Maven Central 38 | 39 | TODO: complete this 40 | It is a work in progress https://issues.sonatype.org/browse/OSSRH-36816 41 | -------------------------------------------------------------------------------- /bin/sparkler.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Attempt to resolve the sparkler jar using relative paths 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | DIR="$DIR/.." 6 | 7 | JAR=`echo $DIR/sparkler-app-*-SNAPSHOT.jar` 8 | if [ -f "$JAR" ] 9 | then 10 | # run 11 | # -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 12 | java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$DIR/plugins edu.usc.irds.sparkler.Main $@ 13 | exit 0 14 | fi 15 | 16 | # Attempt to resolve the sparkler jar using absolute paths 17 | # We do this because in the elastic-search deployment we add sparkler.sh to /usr/bin 18 | # In that case the Sparkler jar cannot be resolved via relative paths. 
19 | # The following code block resolves the absolute location of this script on disk 20 | # We assume that it is located in sparkler-core/bin/ 21 | SOURCE="${BASH_SOURCE[0]}" 22 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 23 | DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" 24 | SOURCE="$(readlink "$SOURCE")" 25 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 26 | done 27 | DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" 28 | 29 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 30 | SPARKLER_BUILD_DIR="$DIR/../build" 31 | JAR=`echo $DIR/../sparkler-app-*/lib` 32 | #if [ ! -f "$JAR" ] 33 | # then 34 | # echo "ERROR: Can't find Sparkler Jar at $JAR. 35 | # Looks like the jar is not built. Please refer to build instructions. Or see ./dockler.sh" 36 | # exit 2 37 | #fi 38 | 39 | # run 40 | # debugging lines 41 | # -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005 42 | #java -Xms1g -cp $DIR/../conf:$JAR/* -Dpf4j.pluginsDir=$DIR/../plugins edu.usc.irds.sparkler.Main $@ 43 | java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$SPARKLER_BUILD_DIR/plugins edu.usc.irds.sparkler.Main $@ 44 | -------------------------------------------------------------------------------- /conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | status = error 19 | name = PropertiesConfig 20 | 21 | filters = threshold 22 | 23 | filter.threshold.type = ThresholdFilter 24 | filter.threshold.level = INFO 25 | 26 | appenders = console 27 | 28 | appender.console.type = Console 29 | appender.console.name = STDOUT 30 | appender.console.layout.type = PatternLayout 31 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 32 | 33 | rootLogger.level = warn 34 | rootLogger.appenderRefs = stdout 35 | rootLogger.appenderRef.stdout.ref = STDOUT 36 | logger.irds.name = edu.usc.irds 37 | logger.irds.level=DEBUG 38 | 39 | #rootLogger.level = INFO 40 | #rootLogger.appenderRefs = STDOUT 41 | #rootLogger.appenderRef.stdout.ref = STDOUT 42 | #logger.irds.name = edu.usc.irds 43 | #logger.irds.level=DEBUG 44 | #logger.kythera.name = com.kytheralabs 45 | #logger.kythera.level = DEBUG 46 | #logger.spicule.name = uk.co.spicule 47 | #logger.spicule.level = DEBUG -------------------------------------------------------------------------------- /conf/log4j2.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | status = error 20 | name = PropertiesConfig 21 | 22 | filters = threshold 23 | 24 | filter.threshold.type = ThresholdFilter 25 | filter.threshold.level = INFO 26 | 27 | appenders = console 28 | 29 | appender.console.type = Console 30 | appender.console.name = STDOUT 31 | appender.console.layout.type = PatternLayout 32 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 33 | 34 | #rootLogger.level = INFO 35 | #rootLogger.appenderRefs = STDOUT 36 | #rootLogger.appenderRef.stdout.ref = STDOUT 37 | #logger.irds.name = edu.usc.irds 38 | #logger.irds.level=DEBUG 39 | #logger.kythera.name = com.kytheralabs 40 | #logger.kythera.level = DEBUG 41 | #logger.spicule.name = uk.co.spicule 42 | #logger.spicule.level = DEBUG -------------------------------------------------------------------------------- /conf/regex-urlfilter.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # The default url filter. 18 | # Better for whole-internet crawling. 19 | 20 | # Each non-comment, non-blank line contains a regular expression 21 | # prefixed by '+' or '-'. The first matching pattern in the file 22 | # determines whether a URL is included or ignored. If no pattern 23 | # matches, the URL is ignored. 24 | 25 | # skip file: ftp: and mailto: urls 26 | -^(file|ftp|mailto): 27 | 28 | # Default: skip image and other suffixes which produces large content 29 | # for a more extensive coverage use the urlfilter-suffix plugin 30 | -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4)$ 31 | 32 | # skip URLs containing certain characters as probable queries, etc. 33 | #-[?*!@=] 34 | 35 | # skip URLs with slash-delimited segment that repeats 3+ times, to break loops 36 | -.*(/[^/]+)/[^/]+\1/[^/]+\1/ 37 | 38 | # accept any HTTP URL 39 | +^https?:// 40 | 41 | # reject the rest 42 | -. 43 | -------------------------------------------------------------------------------- /conf/solr-schema-map.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ################## Solr Schema Map Properties ####################### 17 | 18 | #overrides: 19 | # id: id 20 | 21 | typeSuffix: 22 | java.lang.String: _t 23 | java.lang.Integer: _i 24 | java.lang.Long: _l 25 | java.lang.Boolean: _b 26 | java.lang.Float: _f 27 | java.lang.Double: _d 28 | java.util.Date: _dt 29 | 30 | multiValSuffix: s -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/_rest_managed.json: -------------------------------------------------------------------------------- 1 | {"initArgs":{},"managedList":[]} 2 | -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/enumsConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | UNFETCHED 22 | FETCHING 23 | FETCHED 24 | IGNORED 25 | ERROR 26 | 27 | 28 | -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | #Standard english stop words taken from Lucene's StopAnalyzer 17 | a 18 | an 19 | and 20 | are 21 | as 22 | at 23 | be 24 | but 25 | by 26 | for 27 | if 28 | in 29 | into 30 | is 31 | it 32 | no 33 | not 34 | of 35 | on 36 | or 37 | s 38 | such 39 | t 40 | that 41 | the 42 | their 43 | then 44 | there 45 | these 46 | they 47 | this 48 | to 49 | was 50 | will 51 | with 52 | -------------------------------------------------------------------------------- /conf/solr/crawldb/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 
26 | 27 | # Synonym mappings can be used for spelling correction too 28 | pixima => pixma 29 | 30 | -------------------------------------------------------------------------------- /conf/solr/crawldb/core.properties: -------------------------------------------------------------------------------- 1 | #Written by CorePropertiesLocator 2 | #Thu Oct 06 05:54:35 UTC 2016 3 | name=crawldb 4 | -------------------------------------------------------------------------------- /conf/solr/solr.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 27 | 28 | 29 | 30 | 31 | 32 | ${host:} 33 | ${jetty.port:8983} 34 | ${hostContext:solr} 35 | 36 | ${genericCoreNodeNames:true} 37 | 38 | ${zkClientTimeout:30000} 39 | ${distribUpdateSoTimeout:600000} 40 | ${distribUpdateConnTimeout:60000} 41 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider} 42 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider} 43 | 44 | 45 | 46 | 48 | ${socketTimeout:600000} 49 | ${connTimeout:60000} 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /conf/solr/sparkler-jetty-context.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | /solr-webapp/sparkler 6 | false 7 | 8 | -------------------------------------------------------------------------------- /conf/user-agents.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # User agents to be used 17 | # Each line contains an agent 18 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client1 19 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client2 20 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client3 21 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client4 -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | *.sw? 
2 | _site 3 | _pages 4 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Sparkler Docs 2 | 3 | Read the docs at http://irds.usc.edu/sparkler 4 | -------------------------------------------------------------------------------- /docs/Sparkler-Dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/Sparkler-Dashboard.png -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Site title and subtitle. This is used in _includes/header.html 2 | title: 'Sparkler' 3 | subtitle: 'Spark Crawler' 4 | 5 | # if you wish to integrate disqus on pages set your shortname here 6 | disqus_shortname: 'Sparkler' 7 | 8 | # if you use google analytics, add your tracking id here 9 | google_analytics_id: 'UA-77850818-1' 10 | 11 | # Enable/show navigation. There are three options: 12 | # 0 - always hide 13 | # 1 - always show 14 | # 2 - show only if posts are present 15 | navigation: 1 16 | 17 | # URL to source code, used in _includes/footer.html 18 | codeurl: 'https://github.com/USCDataScience/sparkler' 19 | 20 | # Default categories (in order) to appear in the navigation 21 | sections: [ 22 | ['doc', 'Documentation'], 23 | ['tut', 'Tutorial'], 24 | ['ref', 'Reference'], 25 | ['dev', 'Developers'], 26 | ['post', 'Posts'] 27 | ] 28 | 29 | # Keep as an empty string if served up at the root. If served up at a specific 30 | # path (e.g. on GitHub pages) leave off the trailing slash, e.g.
/my-project 31 | baseurl: '/sparkler' 32 | 33 | # Dates are not included in permalinks 34 | permalink: none 35 | 36 | # Syntax highlighting 37 | highlighter: rouge 38 | 39 | # Since these are pages, it doesn't really matter 40 | future: true 41 | 42 | # Exclude non-site files 43 | exclude: ['bin', 'README.md', 'presentations', 'proposal', 'Sparkler-Dashboard.png'] 44 | 45 | # Use the kramdown Markdown renderer 46 | markdown: kramdown 47 | redcarpet: 48 | extensions: [ 49 | 'no_intra_emphasis', 50 | 'fenced_code_blocks', 51 | 'autolink', 52 | 'strikethrough', 53 | 'superscript', 54 | 'with_toc_data', 55 | 'tables', 56 | 'hardwrap' 57 | ] 58 | -------------------------------------------------------------------------------- /docs/_includes/disqus.html: -------------------------------------------------------------------------------- 1 |
2 | 13 | 14 | -------------------------------------------------------------------------------- /docs/_includes/footer.html: -------------------------------------------------------------------------------- 1 | Documentation for {{ site.title }} 2 | -------------------------------------------------------------------------------- /docs/_includes/google_analytics.html: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /docs/_includes/header.html: -------------------------------------------------------------------------------- 1 |

{{ site.title }} 2 | {% if site.subtitle %}{{ site.subtitle }}{% endif %} 3 |

4 | -------------------------------------------------------------------------------- /docs/_includes/navigation.html: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /docs/_layouts/page.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | 10 | 11 | {{ content }} 12 | -------------------------------------------------------------------------------- /docs/_posts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/_posts/.gitkeep -------------------------------------------------------------------------------- /docs/_posts/2017-12-26-contributing-to-docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Contributing to Docs" 4 | category: dev 5 | date: 2017-12-26 15:27:29 6 | --- 7 | 8 | Contributions are welcome all the way - big or small, including adding tutorials and how-to's! 9 | 10 | This page explains how to update the documentation. 11 | 12 | To add a new page to this website 13 | 14 | ```bash 15 | ruby bin/jekyll-page "Page Title" <category> 16 | ``` 17 | 18 | `<category>` can be: 19 | 20 | - `doc` - Documentation 21 | - `tut` - Tutorial 22 | - `ref` - Reference 23 | - `dev` - Developers 24 | - `post` - Posts 25 | 26 | For example, if you want to write a tutorial about **Crawling images using Sparkler** 27 | 28 | 29 | 30 | ```bash 31 | ruby bin/jekyll-page "Crawling Images using Sparkler" tut 32 | ``` 33 | 34 | Then edit the markdown file under the `_posts/` directory. 35 | 36 | Then follow the standard github contribution guideline.
37 | If you have not already done so, fork this project from [https://github.com/USCDataScience/sparkler](https://github.com/USCDataScience/sparkler) to https://github.com/<your-username>/sparkler 38 | 39 | ```bash 40 | git remote add own git@github.com:<your-username>/sparkler.git 41 | git add docs/_posts/* 42 | git commit -m 'Added documentation for ___' 43 | git push own 44 | ``` 45 | 46 | Then raise a pull request at [https://github.com/USCDataScience/sparkler](https://github.com/USCDataScience/sparkler) using the github web UI. 47 | 48 | Contact developers on [slack](/sparkler/#slack) if you have questions. 49 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/changelog.md -------------------------------------------------------------------------------- /docs/css/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-weight: 400; 3 | text-shadow: 0 1px 1px rgba(255, 255, 255, 0.7); 4 | } 5 | 6 | pre, code, pre code { 7 | border: none; 8 | border-radius: 0; 9 | background-color: #f9f9f9; 10 | font-size: 0.85em; 11 | } 12 | 13 | .highlight { 14 | background-color: #f9f9f9; 15 | } 16 | 17 | pre { 18 | font-size: 1em; 19 | } 20 | 21 | code { 22 | color: inherit; 23 | } 24 | 25 | #header { 26 | border-bottom: 1px solid #eee; 27 | margin-bottom: 20px; 28 | } 29 | 30 | #header a:hover { 31 | text-decoration: none; 32 | } 33 | 34 | #footer { 35 | margin: 20px 0; 36 | font-size: 0.85em; 37 | color: #999; 38 | text-align: center; 39 | } 40 | 41 | #content > .page-header:first-child { 42 | margin-top: 0; 43 | } 44 | 45 | #content > .page-header:first-child h2 { 46 | margin-top: 0; 47 | } 48 | 49 | 50 | #navigation { 51 | font-size: 0.9em; 52 | } 53 | 54 | #navigation li a { 55 | padding-left: 10px; 56 | padding-right: 10px; 57 | } 58
| 59 | #navigation .nav-header { 60 | padding-left: 0; 61 | padding-right: 0; 62 | } 63 | 64 | body.rtl { 65 | direction: rtl; 66 | } 67 | 68 | body.rtl #header .brand { 69 | float: right; 70 | margin-left: 5px; 71 | } 72 | body.rtl .row-fluid [class*="span"] { 73 | float: right !important; 74 | margin-left: 0; 75 | margin-right: 2.564102564102564%; 76 | } 77 | body.rtl .row-fluid [class*="span"]:first-child { 78 | margin-right: 0; 79 | } 80 | 81 | body.rtl ul, body.rtl ol { 82 | margin: 0 25px 10px 0; 83 | } 84 | 85 | table { 86 | margin-bottom: 1rem; 87 | border: 1px solid #e5e5e5; 88 | border-collapse: collapse; 89 | } 90 | 91 | td, th { 92 | padding: .25rem .5rem; 93 | border: 1px solid #e5e5e5; 94 | } 95 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: "USC-IRDS Sparkler Documentation" 4 | --- 5 | 6 | **Sparkler** is a modern crawler powered by Apache Spark. 7 | 8 | 9 | ## Getting Help 10 | 11 | ### Using GitHub effectively 12 | If you have caught a bug, or need a feature, please create an issue on github: https://github.com/USCDataScience/sparkler/issues/new 13 | 14 | 15 | ### [Using Slack Channel](#slack) 16 | Two step process: 17 | 1. Join the Slack group using [this invitation link](https://join.slack.com/t/uscdatascience/shared_invite/enQtMjkwMTMzMDA2MTMxLWQwZjAyYTA3MDc4MjkyOTZlNzEyNzkxMGU3MzY5MWM0NDdmNWE1MmQxMWUwZjU0YWViMzBjNzg0YTM0NzE5ODg) 18 | 2. 
Navigate to the **Sparkler** channel located at [https://uscdatascience.slack.com/messages/sparkler](https://uscdatascience.slack.com/messages/sparkler) 19 | 20 | ### Using Mailing List 21 | Send your questions to the mailing list irds-l@usc.edu 22 | 23 | ## Developers and Contributors 24 | 25 | [Full list is here](https://github.com/USCDataScience/sparkler/graphs/contributors) 26 | 27 | 28 | --- 29 | 30 | ## Contributing 31 | - [Modifications to Source Code](/sparkler/dev/development-environment-setup.html#contributing-source) 32 | - [Updating documentation](/sparkler/dev/contributing-to-docs.html) 33 | 34 | -------------------------------------------------------------------------------- /docs/presentations/Sparkler-for-SparkSummitEast17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/presentations/Sparkler-for-SparkSummitEast17.pdf -------------------------------------------------------------------------------- /project/PluginDependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import sbt._ 19 | 20 | // Define global plugin dependencies here 21 | object PluginDependencies {} 22 | 23 | object FetcherChrome { 24 | object Selenium { 25 | private val group = "org.seleniumhq.selenium" 26 | private val version = "3.141.59" 27 | lazy val chromeDriver = group % "selenium-chrome-driver" % version 28 | lazy val java = group % "selenium-java" % version 29 | } 30 | lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT" 31 | lazy val seleniumscripter = "uk.co.spicule" % "seleniumscripter" % "1.7.9" 32 | lazy val magnesium_script = "uk.co.spicule" % "magnesium-script" % "0.2.0" 33 | } 34 | 35 | object FetcherHtmlUnit { 36 | lazy val htmlUnit = "net.sourceforge.htmlunit" % "htmlunit" % "2.26" 37 | } 38 | 39 | object FetcherJBrowser { 40 | lazy val jBrowser = "com.machinepublishers" % "jbrowserdriver" % "0.16.4" 41 | } 42 | 43 | object ScorerDdSvn { 44 | lazy val httpClient = "org.apache.httpcomponents" % "httpclient" % "4.3.6" 45 | } 46 | 47 | object Databricks { 48 | lazy val wrapper = "com.kytheralabs" % "webcrawlerwrapper_2.12" % "0.1-SNAPSHOT" 49 | } 50 | 51 | object UrlFilterSameHost { 52 | lazy val guava = "com.google.guava" % "guava" % "31.0.1-jre" 53 | } -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.5.0 2 | -------------------------------------------------------------------------------- /project/metals.sbt: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT! This file is auto-generated. 2 | 3 | // This file enables sbt-bloop to create bloop config files. 
4 | 5 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.10-8-8d1cbc4f") 6 | 7 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 18 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4") 19 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") 20 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13") 21 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13") 22 | addSbtPlugin("com.gilcloud" % "sbt-gitlab" % "0.0.6") -------------------------------------------------------------------------------- /project/project/metals.sbt: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT! This file is auto-generated. 2 | 3 | // This file enables sbt-bloop to create bloop config files. 
4 | 5 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.10-8-8d1cbc4f") 6 | 7 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Script : release.sh 18 | # Usage : ./release.sh 19 | # Description: Release Sparkler Silently - Create tag with version in version.sbt and bump it 20 | 21 | sbt clean package test && sbt releaseSilent 22 | -------------------------------------------------------------------------------- /retired/sparkler-sce/README.md: -------------------------------------------------------------------------------- 1 | # polar-domain-discovery 2 | Domain Discovery on Any Domain 3 | -------------------------------------------------------------------------------- /retired/sparkler-sce/compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | single-server-int: 4 | networks: 5 | - sparkler 6 | image: arangodb:3.4.6 7 | ports: 8 | - 8529:8529 9 | environment: 10 | - ARANGO_NO_AUTH=1 11 | volumes: 12 | - db:/var/lib/arangodb3 13 | sce-solr: 14 | image: uscdatascience/sparkler-solr:latest 15 | networks: 16 | - sparkler 17 | ports: 18 | - "8983:8983" 19 | volumes: 20 | - data:/opt/solr/server/solr/mycores 21 | command: ['/data/solr/bin/solr','start','-f'] 22 | 23 | sce-splash: 24 | image: scrapinghub/splash 25 | networks: 26 | - sparkler 27 | ports: 28 | - 8050:8050 29 | 30 | sce-ui: 31 | image: uscdatascience/sparkler-ui:latest 32 | networks: 33 | - sparkler 34 | ports: 35 | - "8080:80" 36 | volumes: 37 | - /var/run/docker.sock:/var/run/docker.sock 38 | - models:/models 39 | volumes: 40 | data: 41 | db: 42 | models: 43 | 44 | networks: 45 | sparkler: 46 | external: 47 | name: sparkler 48 | -------------------------------------------------------------------------------- /retired/sparkler-sce/deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-buster 2 | RUN apt update && apt install -y docker.io 3 | 4 | WORKDIR /projects/sce-domain-discovery/webui 5 | 6 | COPY webui/requirements.txt /projects/sce-domain-discovery/webui/ 7 | 
8 | RUN pip install -r requirements.txt && mkdir /models && mkdir /images 9 | 10 | COPY . /projects/sce-domain-discovery/ 11 | 12 | 13 | CMD ["python", "waitress_server.py"] 14 | -------------------------------------------------------------------------------- /retired/sparkler-sce/evaluation/phase1/readme.md: -------------------------------------------------------------------------------- 1 | **Seed Exploration URLs** 2 | 3 | https://docs.google.com/spreadsheets/d/1rbSE1v8Cu9_NQYpKtquvgdLUzJHxkWoOdR6rxs4u7Gg/edit#gid=0 4 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-buster 2 | RUN apt update && apt install -y docker.io 3 | 4 | WORKDIR /projects/sce-domain-discovery/webui 5 | 6 | COPY webui/requirements.txt /projects/sce-domain-discovery/webui/ 7 | 8 | RUN pip install -r requirements.txt && mkdir /models && mkdir /images 9 | 10 | COPY . 
/projects/sce-domain-discovery/ 11 | 12 | 13 | CMD ["python", "waitress_server.py"] 14 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/app/__init__.py: -------------------------------------------------------------------------------- 1 | """Bootstrap the API""" 2 | 3 | import logging 4 | 5 | from flask import Flask 6 | # Import a module / component using its blueprint handler variable 7 | from app.controller import MOD_APP as app_module 8 | # Import flask and template operators 9 | from app.apis import API_OBJ 10 | 11 | # Define the WSGI application object 12 | APP = Flask(__name__, 13 | static_url_path='', 14 | static_folder='static') 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | # Configurations 19 | APP.config.from_object('config') 20 | 21 | 22 | # Register blueprint(s) 23 | APP.register_blueprint(app_module) 24 | 25 | 26 | # Initialize flask-restplus 27 | API_OBJ.init_app(APP) 28 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/app/apis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup the API and add the required namespaces 3 | """ 4 | 5 | from flask_restplus import Api 6 | 7 | from app.apis.ns_search import API as search_api 8 | from app.apis.ns_classify import API as classify_api 9 | 10 | API_OBJ = Api(title='Seed Generation', version='1.0', 11 | description='Tool to generate seeds for Domain Discovery', doc='/doc') 12 | 13 | API_OBJ.add_namespace(search_api) 14 | API_OBJ.add_namespace(classify_api) 15 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/app/apis/ns_classify.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classify Endpoints for the REST API. 
3 | """ 4 | import json 5 | import os 6 | 7 | from flask_restplus import Namespace, Resource 8 | from flask import request 9 | from app import classifier 10 | 11 | PFX = os.getenv('API_PFX', '') 12 | 13 | 14 | API = Namespace('classify', description='Interact with the ML model', path=PFX+'/classify') 15 | 16 | 17 | @API.route('/predict', methods=['GET', 'POST']) 18 | class Predict(Resource): 19 | """Predict a result""" 20 | @API.doc('predict') 21 | @staticmethod 22 | def get(content, model): 23 | """ 24 | Predict using ML model 25 | :param content: 26 | :param model: 27 | :return: 28 | """ 29 | classes = { 30 | -1: 'Model doesn\'t exist', 31 | 0: 'Not Relevant', 32 | 1: 'Relevant', 33 | 2: 'Highly Relevant' 34 | } 35 | args = request.args 36 | if len(args) != 0: 37 | content = args['content'] 38 | if content: 39 | result = classifier.predict(model, content) 40 | return classes[result] 41 | print('NO CONTENT FOUND') 42 | return classes[-1] 43 | 44 | @API.doc('predict') 45 | @staticmethod 46 | def post(): 47 | """ 48 | Predict using ML model 49 | :return: 50 | """ 51 | classes = { 52 | -1: 'Model doesn\'t exist', 53 | 0: 'Not Relevant', 54 | 1: 'Relevant', 55 | 2: 'Highly Relevant' 56 | } 57 | result = -1 58 | data = request.data 59 | loaded_data = json.loads(data.decode('utf-8', 'ignore')) 60 | if len(data) != 0: 61 | content = loaded_data['score'][0]['content'] 62 | if content is None: 63 | return classes[-1] 64 | 65 | model = loaded_data['score'][0]['model'] 66 | result = classifier.predict(model, content) 67 | return classes[result] 68 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/app/apis/ns_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search Endpoints for the REST API 3 | """ 4 | import json 5 | import os 6 | 7 | from flask_restplus import Namespace, Resource, cors 8 | from flask import current_app as a 9 | from pyArango.theExceptions 
import DocumentNotFoundError 10 | from werkzeug.exceptions import BadRequest 11 | from app import search 12 | 13 | PFX = os.getenv('API_PFX', '') 14 | 15 | API = Namespace('search', description='Query Duck Duck Go for results', path=PFX+'/search') 16 | 17 | 18 | @API.route('//') 19 | @API.param('query', 'Query string to search') 20 | class Search(Resource): 21 | """ Search a resource """ 22 | @classmethod 23 | @API.doc('search') 24 | @cors.crossdomain(origin='*') 25 | def get(cls, model, query): 26 | """ 27 | Search Duck Duck Go 28 | :param model: 29 | :param query: 30 | :return: 31 | """ 32 | a.logger.debug('Search Called!') 33 | try: 34 | url_details = search.query_and_fetch(query, model, top_n=12) 35 | except DocumentNotFoundError as exception: 36 | print(exception) 37 | raise BadRequest('Model Not Found') 38 | 39 | return json.dumps(url_details) 40 | 41 | 42 | @API.route('///') 43 | @API.param('query', 'Query string to search') 44 | @API.param('page', 'Results Page') 45 | class SearchPaginated(Resource): 46 | """Execute a paginated search""" 47 | @classmethod 48 | @API.doc('searchpaginated') 49 | @cors.crossdomain(origin='*') 50 | def get(cls, model, query, page): 51 | """ 52 | Search Duck Duck Go 53 | :param model: 54 | :param query: 55 | :param page: 56 | :return: 57 | """ 58 | a.logger.debug('Paged Search Called!') 59 | try: 60 | url_details = search.query_and_fetch(query, model, page=int(page), top_n=12) 61 | except DocumentNotFoundError as exception: 62 | print(exception) 63 | raise BadRequest('Model Not Found') 64 | 65 | return json.dumps(url_details) 66 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/app/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-sce/webui/app/models/__init__.py 
-------------------------------------------------------------------------------- /retired/sparkler-sce/webui/config.py: -------------------------------------------------------------------------------- 1 | """Flask config options""" 2 | import os 3 | # Statement for enabling the development environment 4 | DEBUG = True 5 | 6 | # Define the application directory 7 | BASE_DIR = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | # Application threads. A common general assumption is 10 | # using 2 per available processor cores - to handle 11 | # incoming requests using one and performing background 12 | # operations using the other. 13 | THREADS_PER_PAGE = 24 14 | 15 | # Enable protection against *Cross-site Request Forgery (CSRF)* 16 | CSRF_ENABLED = True 17 | 18 | # Use a secure, unique and absolutely secret key for 19 | # signing the data. 20 | CSRF_SESSION_KEY = 'secretDD' 21 | 22 | # Secret key for signing cookies 23 | SECRET_KEY = 'secretDD' 24 | 25 | # Path to the uploads 26 | UPLOAD_FOLDER = '..' 
27 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/keywords.txt: -------------------------------------------------------------------------------- 1 | This is a test 2 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/requirements.txt: -------------------------------------------------------------------------------- 1 | aniso8601==8.0.0 2 | astroid==2.3.3 3 | attrs==19.3.0 4 | beautifulsoup4==4.9.0 5 | bs4==0.0.1 6 | certifi==2020.4.5.1 7 | chardet==3.0.4 8 | click==7.1.1 9 | DateTime==4.3 10 | Flask==1.1.2 11 | Flask-Cors==3.0.9 12 | flask-restplus==0.13.0 13 | future==0.18.2 14 | idna==2.9 15 | importlib-metadata==1.6.0 16 | isort==4.3.21 17 | itsdangerous==1.1.0 18 | Jinja2==2.11.3 19 | joblib==0.14.1 20 | jsonschema==3.2.0 21 | lazy-object-proxy==1.4.3 22 | MarkupSafe==1.1.1 23 | mccabe==0.6.1 24 | numpy==1.21.0 25 | pyArango==1.3.4 26 | pylint==2.4.4 27 | pyrsistent==0.16.0 28 | pytz==2019.3 29 | PyYAML==5.4 30 | requests==2.23.0 31 | scikit-learn==0.22.2.post1 32 | scipy==1.4.1 33 | six==1.14.0 34 | sklearn==0.0 35 | soupsieve==2.0 36 | typed-ast==1.4.1 37 | urllib3==1.26.5 38 | waitress==1.4.3 39 | Werkzeug==0.16.1 40 | wrapt==1.11.2 41 | zipp==3.1.0 42 | zope.interface==5.1.0 43 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/run.py: -------------------------------------------------------------------------------- 1 | """Run the flask server""" 2 | from app import APP 3 | 4 | # Run Server 5 | if __name__ == '__main__': 6 | APP.run(host='0.0.0.0', port=5000, debug=True, threaded=True) 7 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/run.wsgi: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | import logging 4 | import sys 5 | logging.basicConfig(stream=sys.stderr) 6 | sys.path.insert(0, '/sce/webui') 7 | from run import APP as application 8 | application.secret_key = 'anything you wish' 9 | -------------------------------------------------------------------------------- /retired/sparkler-sce/webui/waitress_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run Flask App in production context 3 | """ 4 | 5 | from waitress import serve 6 | import run 7 | 8 | serve(run.APP, host='0.0.0.0', port=5000) 9 | -------------------------------------------------------------------------------- /retired/sparkler-ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM uscdatascience/sce-domain-discovery:latest 2 | 3 | RUN echo 'hello' 4 | 5 | FROM ubuntu:bionic 6 | 7 | WORKDIR /usr/src/app 8 | 9 | RUN apt update && apt-get install -y sudo python3 python3-dev python3-pip docker.io apache2 libapache2-mod-wsgi-py3 curl && curl -sL https://deb.nodesource.com/setup_10.x | bash - && apt update && apt install -y nodejs 10 | 11 | #RUN adduser --disabled-password --gecos '' docker 12 | RUN adduser www-data sudo 13 | 14 | RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 15 | COPY package*.json ./ 16 | COPY scripts/run.sh / 17 | RUN npm ci --only=production 18 | COPY . . 
19 | RUN npm run build && apt update && chmod +x /run.sh && a2enmod proxy && a2enmod proxy_http && mkdir /var/www/html/explorer && cp -rf build/* /var/www/html/explorer/ 20 | COPY scripts/000-default.conf /etc/apache2/sites-available/ 21 | 22 | EXPOSE 8080 23 | EXPOSE 80 24 | 25 | COPY --from=0 /projects/sce-domain-discovery /sce 26 | 27 | RUN cd /sce/webui && pip3 install -r requirements.txt && mkdir /images && mkdir /models && chown www-data:www-data /images && chown www-data:www-data /models && gpasswd -a www-data docker 28 | RUN ln -sf /dev/stdout /var/log/apache2/access.log \ 29 | && ln -sf /dev/stderr /var/log/apache2/error.log 30 | CMD [ "/run.sh" ] 31 | -------------------------------------------------------------------------------- /retired/sparkler-ui/database.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "database.arangodb.com/v1alpha" 2 | kind: "ArangoDeployment" 3 | metadata: 4 | name: "single-server" 5 | spec: 6 | mode: Single 7 | 8 | -------------------------------------------------------------------------------- /retired/sparkler-ui/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sce-ui", 3 | "version": "0.1.0", 4 | "private": true, 5 | "homepage": "/explorer/", 6 | "dependencies": { 7 | "@blueprintjs/core": "^3.14.1", 8 | "arangojs": "5", 9 | "axios": "^0.21.1", 10 | "dotenv": "^7.0.0", 11 | "js-file-download": "^0.4.10", 12 | "location-origin": "^1.1.4", 13 | "nano": "^8.0.1", 14 | "react": "^16.8.5", 15 | "react-dom": "^16.8.5", 16 | "react-grid-system": "^4.4.3", 17 | "react-iframe": "^1.7.11", 18 | "react-redux": "^7.0.1", 19 | "react-router-dom": "^5.0.0", 20 | "react-scripts": "2.1.8", 21 | "redux": "^4.0.1", 22 | "redux-thunk": "^2.3.0" 23 | }, 24 | "scripts": { 25 | "start": "react-scripts start", 26 | "build": "react-scripts build", 27 | "test": "react-scripts test", 28 | "eject": "react-scripts eject" 29 | }, 30 | 
"eslintConfig": { 31 | "extends": "react-app" 32 | }, 33 | "browserslist": [ 34 | ">0.2%", 35 | "not dead", 36 | "not ie <= 11", 37 | "not op_mini all" 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /retired/sparkler-ui/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-ui/public/favicon.ico -------------------------------------------------------------------------------- /retired/sparkler-ui/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 10 | 11 | 15 | 16 | 25 | React App 26 | 27 | 28 | 29 |
30 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /retired/sparkler-ui/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 | -------------------------------------------------------------------------------- /retired/sparkler-ui/scripts/000-default.conf: -------------------------------------------------------------------------------- 1 | 2 | # The ServerName directive sets the request scheme, hostname and port that 3 | # the server uses to identify itself. This is used when creating 4 | # redirection URLs. In the context of virtual hosts, the ServerName 5 | # specifies what hostname must appear in the request's Host: header to 6 | # match this virtual host. For the default virtual host (this file) this 7 | # value is not decisive as it is used as a last resort host regardless. 8 | # However, you must set it for any further virtual host explicitly. 9 | #ServerName www.example.com 10 | 11 | ServerAdmin webmaster@localhost 12 | DocumentRoot /var/www/html 13 | 14 | # Available loglevels: trace8, ..., trace1, debug, info, notice, warn, 15 | # error, crit, alert, emerg. 16 | # It is also possible to configure the loglevel for particular 17 | # modules, e.g. 
18 | #LogLevel info ssl:warn 19 | 20 | ErrorLog ${APACHE_LOG_DIR}/error.log 21 | CustomLog ${APACHE_LOG_DIR}/access.log combined 22 | 23 | ProxyPreserveHost On 24 | ProxyPass http://sce-solr:8983/banana 25 | ProxyPassReverse http://sce-solr:8983/banana 26 | Order allow,deny 27 | Allow from all 28 | 29 | 30 | ProxyPreserveHost On 31 | ProxyPass http://sce-solr:8983/solr 32 | ProxyPassReverse http://sce-solr:8983/solr 33 | Order allow,deny 34 | Allow from all 35 | 36 | 37 | WSGIScriptAlias /explorer-api /sce/webui/run.wsgi 38 | WSGIDaemonProcess hello user=www-data group=www-data threads=5 39 | WSGIScriptReloading On 40 | 41 | WSGIProcessGroup hello 42 | WSGIApplicationGroup %{GLOBAL} 43 | Options Indexes FollowSymLinks MultiViews 44 | AllowOverride None 45 | Order allow,deny 46 | Allow from all 47 | Require all granted 48 | 49 | # For most configuration files from conf-available/, which are 50 | # enabled or disabled at a global level, it is possible to 51 | # include a line for only one particular virtual host. For example the 52 | # following line enables the CGI configuration for this host only 53 | # after it has been globally disabled with "a2disconf". 
54 | #Include conf-available/serve-cgi-bin.conf 55 | 56 | 57 | # vim: syntax=apache ts=4 sw=4 sts=4 sr noet 58 | -------------------------------------------------------------------------------- /retired/sparkler-ui/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exec apachectl -D FOREGROUND 3 | -------------------------------------------------------------------------------- /retired/sparkler-ui/src/App.css: -------------------------------------------------------------------------------- 1 | @import "~normalize.css"; 2 | @import "~@blueprintjs/core/lib/css/blueprint.css"; 3 | @import "~@blueprintjs/icons/lib/css/blueprint-icons.css"; 4 | 5 | .generatemodel > .bp3-input-group { 6 | display: inline-block !important; 7 | } 8 | .App { 9 | text-align: center; 10 | } 11 | 12 | .App-logo { 13 | animation: App-logo-spin infinite 20s linear; 14 | height: 40vmin; 15 | pointer-events: none; 16 | } 17 | 18 | .App-header { 19 | background-color: #282c34; 20 | min-height: 100vh; 21 | display: flex; 22 | flex-direction: column; 23 | align-items: center; 24 | justify-content: center; 25 | font-size: calc(10px + 2vmin); 26 | color: white; 27 | } 28 | 29 | .App-link { 30 | color: #61dafb; 31 | } 32 | 33 | @keyframes App-logo-spin { 34 | from { 35 | transform: rotate(0deg); 36 | } 37 | to { 38 | transform: rotate(360deg); 39 | } 40 | } 41 | 42 | .btn-circle { 43 | 44 | width: 30px; 45 | height: 30px; 46 | padding: 6px 0; 47 | border-radius: 15px; 48 | text-align: center; 49 | font-size: 12px; 50 | line-height: 1.428571429; 51 | 52 | } 53 | 54 | .green{ 55 | background-color: #0d8050; 56 | color: whitesmoke; 57 | } 58 | 59 | .amber{ 60 | background-color: #bf7326; 61 | color: whitesmoke; 62 | } 63 | 64 | .red{ 65 | background-color: #c23030; 66 | color: whitesmoke; 67 | } 68 | 69 | .btn-padding{ 70 | margin: 5px; 71 | } -------------------------------------------------------------------------------- 
/retired/sparkler-ui/src/App.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react'; 2 | import './App.css'; 3 | import Main from './views/Main' 4 | import Navbar from './components/Navbar' 5 | 6 | class App extends Component { 7 | 8 | render() { 9 | return ( 10 |
11 | 12 |
13 |
14 | ); 15 | } 16 | } 17 | 18 | export default App; 19 | -------------------------------------------------------------------------------- /retired/sparkler-ui/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | 5 | it('renders without crashing', () => { 6 | const div = document.createElement('div'); 7 | ReactDOM.render(, div); 8 | ReactDOM.unmountComponentAtNode(div); 9 | }); 10 | -------------------------------------------------------------------------------- /retired/sparkler-ui/src/actions/test.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-ui/src/actions/test.js -------------------------------------------------------------------------------- /retired/sparkler-ui/src/components/Banana.js: -------------------------------------------------------------------------------- 1 | import React, {Component} from 'react' 2 | import Iframe from "react-iframe"; 3 | 4 | class Banana extends Component { 5 | 6 | render() { 7 | return ( 8 |
9 |