├── .dockerignore
├── .github
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE.md
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ ├── codeql-analysis.yml
│ ├── dev-deploy.yaml
│ ├── docker-build.yml
│ └── release-deploy.yaml
├── .gitignore
├── .gitmodules
├── .gitpod.Dockerfile
├── .gitpod.yml
├── .vscode
└── settings.json
├── LICENSE
├── README.md
├── Release-Checklist.md
├── bin
├── dockler.sh
├── kickstart.sh
├── sce.sh
└── sparkler.sh
├── build.sbt
├── conf
├── domain-suffixes.xml
├── felix-config.properties
├── log4j.properties
├── log4j2.properties
├── regex-urlfilter.txt
├── solr-schema-map.yaml
├── solr
│ ├── crawldb
│ │ ├── conf
│ │ │ ├── _rest_managed.json
│ │ │ ├── currency.xml
│ │ │ ├── enumsConfig.xml
│ │ │ ├── lang
│ │ │ │ └── stopwords_en.txt
│ │ │ ├── managed-schema
│ │ │ ├── protwords.txt
│ │ │ ├── solrconfig.xml
│ │ │ ├── stopwords.txt
│ │ │ └── synonyms.txt
│ │ └── core.properties
│ ├── solr.xml
│ └── sparkler-jetty-context.xml
├── sparkler-default.yaml
└── user-agents.txt
├── docs
├── .gitignore
├── README.md
├── Sparkler-Dashboard.png
├── _config.yml
├── _includes
│ ├── disqus.html
│ ├── footer.html
│ ├── google_analytics.html
│ ├── header.html
│ └── navigation.html
├── _layouts
│ ├── default.html
│ └── page.html
├── _posts
│ ├── .gitkeep
│ ├── 2017-12-26-contributing-to-docs.md
│ └── 2017-12-26-development-environment-setup.md
├── bin
│ └── jekyll-page
├── changelog.md
├── css
│ ├── main.css
│ └── syntax.css
├── index.md
├── presentations
│ └── Sparkler-for-SparkSummitEast17.pdf
└── proposal
│ ├── Diagram.xml
│ └── Sparkler-Flow.svg
├── plugins.build.sbt
├── project
├── Dependencies.scala
├── PluginDependencies.scala
├── Settings.scala
├── build.properties
├── metals.sbt
├── plugins.sbt
└── project
│ └── metals.sbt
├── release.sh
├── retired
├── sparkler-sce
│ ├── LICENSE
│ ├── README.md
│ ├── compose
│ │ └── docker-compose.yaml
│ ├── deployment.yaml
│ ├── deployment
│ │ └── Dockerfile
│ ├── evaluation
│ │ └── phase1
│ │ │ ├── readme.md
│ │ │ └── relevancy-model.txt
│ └── webui
│ │ ├── Dockerfile
│ │ ├── app
│ │ ├── __init__.py
│ │ ├── apis
│ │ │ ├── __init__.py
│ │ │ ├── ns_classify.py
│ │ │ └── ns_search.py
│ │ ├── classifier
│ │ │ └── __init__.py
│ │ ├── controller.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ └── model.py
│ │ └── search
│ │ │ ├── __init__.py
│ │ │ └── fetcher.py
│ │ ├── config.py
│ │ ├── keywords.txt
│ │ ├── requirements.txt
│ │ ├── run.py
│ │ ├── run.wsgi
│ │ ├── wait-for-it.sh
│ │ └── waitress_server.py
└── sparkler-ui
│ ├── Dockerfile
│ ├── README.md
│ ├── database.yaml
│ ├── deployment.yaml
│ ├── package-lock.json
│ ├── package.json
│ ├── public
│ ├── favicon.ico
│ ├── index.html
│ └── manifest.json
│ ├── scripts
│ ├── 000-default.conf
│ └── run.sh
│ └── src
│ ├── App.css
│ ├── App.js
│ ├── App.test.js
│ ├── actions
│ ├── index.js
│ └── test.js
│ ├── components
│ ├── Banana.js
│ ├── IFrameFrame.js
│ ├── Navbar.js
│ ├── Previews.js
│ ├── RouterButton.js
│ ├── Solr.js
│ ├── VisitDashboard.js
│ ├── dialogs
│ │ ├── CrawlConfigDialog.js
│ │ ├── ModelDialog.js
│ │ ├── SeedURLDialog.js
│ │ └── StartCrawlDialog.js
│ ├── panels
│ │ ├── ListModels.js
│ │ └── NewModel.js
│ └── sidebar
│ │ ├── CrawlConfig.js
│ │ ├── CreateSeed.js
│ │ ├── ExportData.js
│ │ ├── ExportModel.js
│ │ ├── GenerateModel.js
│ │ ├── StartCrawl.js
│ │ └── StartCrawl_old.js
│ ├── index.css
│ ├── index.js
│ ├── list.csv
│ ├── logo.svg
│ ├── reducers
│ ├── index.js
│ ├── model.js
│ ├── search.js
│ ├── test.js
│ └── types.js
│ ├── serviceWorker.js
│ ├── utils
│ └── constants.js
│ └── views
│ ├── BananaFrame.js
│ ├── Home.js
│ ├── Main.js
│ └── SolrFrame.js
├── scalastyle-config.xml
├── sparkler-api
└── src
│ ├── main
│ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ ├── AbstractExtensionPoint.java
│ │ ├── Config.java
│ │ ├── ConfigKey.java
│ │ ├── Constants.java
│ │ ├── ExtensionChain.java
│ │ ├── ExtensionPoint.java
│ │ ├── Fetcher.java
│ │ ├── GenericProcess.java
│ │ ├── JobContext.java
│ │ ├── Scorer.java
│ │ ├── SparklerConfiguration.java
│ │ ├── SparklerException.java
│ │ ├── URLFilter.java
│ │ ├── URLNormalizer.java
│ │ ├── UrlInjectorObj.java
│ │ ├── model
│ │ ├── FetchedData.java
│ │ ├── MultiMap.java
│ │ ├── Resource.java
│ │ └── ResourceStatus.java
│ │ ├── storage
│ │ ├── FieldMapper.java
│ │ └── StringEvaluator.java
│ │ └── util
│ │ ├── CustomHttpRequestExecutor.java
│ │ ├── DomainSuffix.java
│ │ ├── DomainSuffixes.java
│ │ ├── DomainSuffixesReader.java
│ │ ├── FetcherDefault.java
│ │ ├── StreamTransformer.java
│ │ ├── StringUtil.java
│ │ ├── TestUtils.java
│ │ ├── TopLevelDomain.java
│ │ └── URLUtil.java
│ └── test
│ ├── java
│ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── util
│ │ └── FetcherDefaultTest.java
│ └── resources
│ ├── domain-suffixes.xml
│ ├── sparkler-default.yaml
│ └── user-agents.txt
├── sparkler-app
└── src
│ ├── assembly
│ └── dep.xml
│ ├── main
│ ├── resources
│ │ ├── domain-suffixes.xml
│ │ ├── log4j2.properties
│ │ ├── regex-urlfilter.txt
│ │ ├── solr-schema-map.yaml
│ │ ├── sparkler-default.yaml
│ │ └── user-agents.txt
│ └── scala
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ ├── Main.scala
│ │ ├── base
│ │ ├── CliTool.scala
│ │ ├── Loggable.scala
│ │ └── SparklerSink.scala
│ │ ├── model
│ │ ├── CrawlData.scala
│ │ ├── ParsedData.scala
│ │ └── SparklerJob.scala
│ │ ├── pipeline
│ │ ├── Crawler.scala
│ │ ├── CrawlerRunner.scala
│ │ ├── FairFetcher.scala
│ │ ├── FetchFunction.scala
│ │ ├── GenericFunction.scala
│ │ ├── OutLinkFilterFunction.scala
│ │ ├── ParseFunction.scala
│ │ ├── RunCrawl.scala
│ │ ├── ScoreFunction.scala
│ │ ├── SparklerProducer.scala
│ │ └── UrlInjectorFunction.scala
│ │ ├── service
│ │ ├── Dumper.scala
│ │ ├── Injector.scala
│ │ ├── PluginService.scala
│ │ └── RejectingURLFilterChain.scala
│ │ ├── storage
│ │ ├── ScoreUpdateTransformer.scala
│ │ ├── SparklerGroupPartition.scala
│ │ ├── StatusUpdate.scala
│ │ ├── StatusUpdateTransformer.scala
│ │ ├── StorageProxy.scala
│ │ ├── StorageProxyFactory.scala
│ │ ├── StorageRDD.scala
│ │ ├── Upserter.scala
│ │ ├── elasticsearch
│ │ │ ├── ElasticsearchDeepRDD.scala
│ │ │ ├── ElasticsearchProxy.scala
│ │ │ ├── ElasticsearchRDD.scala
│ │ │ ├── ElasticsearchResultIterator.scala
│ │ │ ├── ElasticsearchUpsert.scala
│ │ │ ├── ScoreUpdateElasticsearchTransformer.scala
│ │ │ └── StatusUpdateElasticsearchTransformer.scala
│ │ └── solr
│ │ │ ├── ContentHash.scala
│ │ │ ├── ScoreUpdateSolrTransformer.scala
│ │ │ ├── SolrDeepRDD.scala
│ │ │ ├── SolrProxy.scala
│ │ │ ├── SolrRDD.scala
│ │ │ ├── SolrResultIterator.scala
│ │ │ ├── SolrUpsert.scala
│ │ │ └── StatusUpdateSolrTransformer.scala
│ │ └── util
│ │ ├── FileDumperTool.scala
│ │ ├── HealthChecks.scala
│ │ ├── JobUtil.scala
│ │ └── NutchBridge.scala
│ └── test
│ └── resources
│ └── sparkler-default.yaml
├── sparkler-deployment
├── docker-k8s
│ └── Dockerfile
├── docker
│ ├── Dockerfile
│ ├── Dockerfile.solr
│ ├── conf
│ │ └── sparkler-default.yaml
│ ├── elasticsearch
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── docker-compose.yml
│ │ ├── dockler.py
│ │ └── scripts
│ │ │ ├── greeting.sh
│ │ │ └── start.sh
│ ├── jetty-csp-patch
│ │ └── jetty.xml
│ └── runsparkler.sh
├── helm
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates
│ │ ├── _helpers.tpl
│ │ └── deployment.yaml
│ ├── test.yaml
│ └── values.yaml
├── juju
│ ├── bundles
│ │ ├── sparkler-basic-spark
│ │ │ ├── README.md
│ │ │ └── bundle.yaml
│ │ └── sparkler-basic
│ │ │ ├── README.md
│ │ │ └── bundle.yaml
│ └── sparkler
│ │ ├── README.md
│ │ ├── actions.yaml
│ │ ├── actions
│ │ ├── addseedurls
│ │ ├── crawl
│ │ ├── inject
│ │ ├── removeallseedurls
│ │ └── removeseedurls
│ │ ├── config.yaml
│ │ ├── icon.svg
│ │ ├── layer.yaml
│ │ ├── metadata.yaml
│ │ ├── reactive
│ │ └── sparkler.py
│ │ ├── resources.yaml
│ │ └── wheelhouse.txt
├── sparkler-init
│ └── Dockerfile
└── sparkler-k8s
│ └── deployment.yaml
├── sparkler-plugins
├── databricks-api-plugin
│ └── src
│ │ └── main
│ │ └── java
│ │ └── com
│ │ └── kytheralabs
│ │ └── databricks
│ │ ├── DatabricksAPI.java
│ │ ├── DatabricksAPIActivator.java
│ │ └── Persistence.java
├── fetcher-chrome
│ └── src
│ │ └── main
│ │ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── plugin
│ │ ├── FetcherChrome.java
│ │ ├── FetcherChromeActivator.java
│ │ └── ProxySelector.java
├── fetcher-htmlunit
│ ├── README.md
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ ├── HtmlUnitFetcher.java
│ │ │ └── HtmlUnitFetcherActivator.java
│ │ └── test
│ │ ├── java
│ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ └── HtmlUnitFetcherTest.java
│ │ └── resources
│ │ └── log4j.properties
├── fetcher-jbrowser
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ ├── FetcherJBrowser.java
│ │ │ └── FetcherJBrowserActivator.java
│ │ └── test
│ │ ├── java
│ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ └── FetcherJBrowserTest.java
│ │ └── resources
│ │ └── log4j.properties
├── scorer-dd-svn
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ ├── DdSvnScorer.java
│ │ │ ├── DdSvnScorerActivator.java
│ │ │ └── ddsvn
│ │ │ └── ApacheHttpRestClient.java
│ │ └── test
│ │ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── plugin
│ │ └── DdSvnScorerTest.java
├── template-plugin
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ ├── MyPlugin.java
│ │ │ └── MyPluginActivator.java
│ │ └── test
│ │ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── plugin
│ │ └── MyPluginTest.java
├── url-injector
│ └── src
│ │ └── main
│ │ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── plugin
│ │ ├── UrlInjector.java
│ │ └── UrlInjectorActivator.java
├── urlfilter-regex
│ └── src
│ │ ├── main
│ │ └── java
│ │ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ ├── RegexURLFilter.java
│ │ │ ├── RegexURLFilterActivator.java
│ │ │ └── regex
│ │ │ ├── RegexRule.java
│ │ │ └── RegexURLFilterBase.java
│ │ └── test
│ │ ├── java
│ │ └── edu
│ │ │ └── usc
│ │ │ └── irds
│ │ │ └── sparkler
│ │ │ └── plugin
│ │ │ └── RegexURLFilterTest.java
│ │ └── resources
│ │ └── regex-urlfilter.txt
└── urlfilter-samehost
│ └── src
│ ├── main
│ └── java
│ │ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── plugin
│ │ ├── UrlFilterSameHost.java
│ │ └── UrlFilterSameHostActivator.java
│ └── test
│ └── java
│ └── edu
│ └── usc
│ └── irds
│ └── sparkler
│ └── plugin
│ └── UrlFilterSameHostTest.java
├── sparkler-tests-base
└── src
│ └── main
│ ├── java
│ └── edu
│ │ └── usc
│ │ └── irds
│ │ └── sparkler
│ │ └── test
│ │ ├── TestSlaveServlet.java
│ │ ├── WebServer.java
│ │ └── WebServerRunListener.java
│ └── resources
│ └── webapp
│ ├── index.html
│ └── jspage.html
└── version.sbt
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # ignore everything
20 | *
21 |
22 | # except these
23 | !build
24 | !sparkler-ui/
25 | !conf
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | #### Issue Description
2 |
3 | Please describe your issue, along with:
4 | - expected behavior
5 | - encountered behavior
6 |
7 | #### How to reproduce it
8 | If you are describing a bug, please describe here how to reproduce it.
9 |
10 | #### Environment and Version Information
11 |
12 | Please indicate relevant versions, including, if relevant:
13 |
14 | * Java Version
15 | * Spark Version
16 | * Operating System name and version
17 |
18 |
19 | #### Any external links for reference
20 |
21 | If you think any other resources on the internet will be helpful to understand and/or resolve this issue, please share them here.
22 |
23 | #### Contributing
24 |
25 | If you'd like to help us fix the issue by contributing some code, but would like guidance or help in doing so, please mention it!
26 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## What changes were proposed in this pull request?
2 |
3 | (Please fill in changes proposed in this fix)
4 |
5 | **Is this related to an already existing issue on sparkler?**
6 | If so, mention that issue by referencing its number here.
7 |
8 | **Will it close an existing issue?**
9 | Say 'Closes #IssueNum' here.
10 |
11 |
12 | ### How was this patch tested?
13 |
14 | We are particularly interested in unit tests, integration tests, manual tests you did to ensure that the patch works as expected, so briefly describe them.
15 |
16 |
17 | Please review
18 | https://github.com/USCDataScience/sparkler/blob/master/.github/CONTRIBUTING.md before opening a pull request.
19 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | env:
4 | REGISTRY: ghcr.io
5 | IMAGE_NAME: ${{ github.repository }}/sparkler
6 |
7 | on:
8 | push:
9 | branches: [ main ]
10 | pull_request:
11 | branches: [ main ]
12 |
13 | jobs:
14 |
15 | build:
16 |
17 | runs-on: ubuntu-latest
18 |
19 | steps:
20 | - uses: actions/checkout@v2
21 | - name: Log in to the Container registry
22 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
23 | with:
24 | registry: ${{ env.REGISTRY }}
25 | username: ${{ github.actor }}
26 | password: ${{ secrets.GITHUB_TOKEN }}
27 | - name: Extract metadata (tags, labels) for Docker
28 | id: meta
29 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
30 | with:
31 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
32 | - name: Build and push Docker image
33 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
34 | with:
35 | context: sparkler-core
36 | push: true
37 | tags: ${{ steps.meta.outputs.tags }}
38 | labels: ${{ steps.meta.outputs.labels }}
39 | file: sparkler-deployment/docker/Dockerfile
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Custom
17 | .bloop
18 | **/scalastyle-config
19 | **/.scalafmt.conf
20 | **/.metals
21 | **/.bsp
22 |
23 | # Standard Java
24 | *.class
25 | *.log
26 | *.jar
27 | *.war
28 | core/build/**
29 |
30 | # sbt specific
31 | .cache
32 | .history
33 | .lib/
34 | dist/*
35 | target/
36 | lib_managed/
37 | src_managed/
38 | project/boot/
39 | project/plugins/project/
40 |
41 | # Scala-IDE specific
42 | .scala_dependencies
43 | .worksheet
44 | **~
45 | sparkler-job-**
46 | .idea/
47 | *.iml
48 |
49 | # Eclipse-IDE specific
50 | .classpath
51 | .project
52 | .cache-main
53 | .settings/
54 |
55 | # auto generated files
56 | sjob-**
57 |
58 | # MAC Files
59 | .DS_Store
60 |
61 | # Application Files
62 | /resources
63 | felix-cache/
64 |
65 | tmp*
66 | workspace*
67 | sparkler-core/build/*
68 | sparkler-ui/node_modules/*
69 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/.gitmodules
--------------------------------------------------------------------------------
/.gitpod.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM registry.gitlab.com/spiculedata/custom-gitpod-full:latest
2 |
3 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.0-linux-x86_64.tar.gz && \
4 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.0-linux-x86_64.tar.gz.sha512 && \
5 | shasum -a 512 -c elasticsearch-7.17.0-linux-x86_64.tar.gz.sha512 && \
6 | tar -xzf elasticsearch-7.17.0-linux-x86_64.tar.gz
7 |
--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
1 | image:
2 | file: .gitpod.Dockerfile
3 |
4 | tasks:
5 | - init: sbt assembly package
6 | command: cd /home/gitpod/elasticsearch-7.17.0/ && ./bin/elasticsearch -d -p pid
7 | ports:
8 | - port: 8080
9 | visibility: public
10 | onOpen: open-browser
11 | - port: 9200
12 | visibility: public
13 | onOpen: open-browser
14 | github:
15 | prebuilds:
16 | # enable for the default branch (defaults to true)
17 | master: true
18 | # enable for all branches in this repo (defaults to false)
19 | branches: true
20 | # enable for pull requests coming from this repo (defaults to true)
21 | pullRequests: true
22 | # enable for pull requests coming from forks (defaults to false)
23 | pullRequestsFromForks: false
24 | # add a check to pull requests (defaults to true)
25 | addCheck: true
26 | # add a "Review in Gitpod" button as a comment to pull requests (defaults to false)
27 | addComment: true
28 | # add a "Review in Gitpod" button to the pull request's description (defaults to false)
29 | addBadge: true
30 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "files.watcherExclude": {
3 | "**/target": true
4 | }
5 | }
--------------------------------------------------------------------------------
/Release-Checklist.md:
--------------------------------------------------------------------------------
1 |
2 | # Release checklist
3 |
4 | Contributors: Thamme Gowda
5 |
6 |
7 | ## Update Versions
8 |
9 | Update the version to $VERSION=x.y.z
10 |
11 | mvn versions:set -DnewVersion=$VERSION
12 | mvn versions:commit
13 |
14 | ## Build the project
15 |
16 | mvn clean package
17 |
18 |
19 | ## Build docker image
20 |
21 | docker build -f sparkler-deployment/docker/Dockerfile . -t sparkler-local
22 |
23 |
24 | ### Push docker image to Docker Hub
25 |
26 | docker login
27 | # the account should have push permission to repo https://hub.docker.com/r/uscdatascience/sparkler
28 |
29 | docker tag sparkler-local uscdatascience/sparkler:$VERSION
30 | docker push uscdatascience/sparkler:$VERSION
31 |
32 | # Also tag and push it as latest
33 | docker tag sparkler-local uscdatascience/sparkler:latest
34 | docker push uscdatascience/sparkler:latest
35 |
36 |
37 | ## Release Jars to Maven Central
38 |
39 | TODO: complete this
40 | It is a work in progress: https://issues.sonatype.org/browse/OSSRH-36816
41 |
--------------------------------------------------------------------------------
/bin/sparkler.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Launcher for the Sparkler command line (edu.usc.irds.sparkler.Main).
4 | # It first looks for the application jar one directory above this script;
5 | # if that jar is absent it falls back to the symlink-resolved script location.
6 | # All command-line arguments are forwarded unchanged to Main.
7 |
8 | # Attempt to resolve the sparkler jar using relative paths
9 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
10 | DIR="$DIR/.."
11 |
12 | # $DIR is quoted so a path with spaces survives; the glob itself stays unquoted so it still expands
13 | JAR=$(echo "$DIR"/sparkler-app-*-SNAPSHOT.jar)
14 | if [ -f "$JAR" ]
15 | then
16 | # run
17 | # -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
18 | # "$@" is quoted so arguments containing spaces or glob characters reach Main intact
19 | java -Xms1g -cp "$DIR/conf:$JAR" -Dpf4j.pluginsDir="$DIR/plugins" edu.usc.irds.sparkler.Main "$@"
20 | # propagate the JVM exit status instead of always reporting success
21 | exit $?
22 | fi
23 |
24 | # Attempt to resolve the sparkler jar using absolute paths
25 | # We do this because in the elastic-search deployment we add sparkler.sh to /usr/bin
26 | # In that case the Sparkler jar cannot be resolved via relative paths.
27 | # The following code block resolves the absolute location of this script on disk
28 | # We assume that it is located in sparkler-core/bin/
29 | SOURCE="${BASH_SOURCE[0]}"
30 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
31 | DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
32 | SOURCE="$(readlink "$SOURCE")"
33 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
34 | done
35 | DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
36 |
37 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
38 | SPARKLER_BUILD_DIR="$DIR/../build"
39 | JAR=$(echo "$DIR"/../sparkler-app-*/lib)
40 | #if [ ! -f "$JAR" ]
41 | # then
42 | # echo "ERROR: Can't find Sparkler Jar at $JAR.
43 | # Looks like the jar is not built. Please refer to build instructions. Or see ./dockler.sh"
44 | # exit 2
45 | #fi
46 |
47 | # run
48 | # debugging lines
49 | # -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
50 | #java -Xms1g -cp $DIR/../conf:$JAR/* -Dpf4j.pluginsDir=$DIR/../plugins edu.usc.irds.sparkler.Main $@
51 | # NOTE(review): $JAR is a lib directory here and $DIR is the script's own directory; the
52 | # commented-out line above used $DIR/../conf and $JAR/* instead - confirm this classpath is intended.
53 | java -Xms1g -cp "$DIR/conf:$JAR" -Dpf4j.pluginsDir="$SPARKLER_BUILD_DIR/plugins" edu.usc.irds.sparkler.Main "$@"
54 |
--------------------------------------------------------------------------------
/conf/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | status = error
19 | name = PropertiesConfig
20 |
21 | filters = threshold
22 |
23 | filter.threshold.type = ThresholdFilter
24 | filter.threshold.level = INFO
25 |
26 | appenders = console
27 |
28 | appender.console.type = Console
29 | appender.console.name = STDOUT
30 | appender.console.layout.type = PatternLayout
31 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
32 |
33 | rootLogger.level = warn
34 | rootLogger.appenderRefs = stdout
35 | rootLogger.appenderRef.stdout.ref = STDOUT
36 | logger.irds.name = edu.usc.irds
37 | logger.irds.level=DEBUG
38 |
39 | #rootLogger.level = INFO
40 | #rootLogger.appenderRefs = STDOUT
41 | #rootLogger.appenderRef.stdout.ref = STDOUT
42 | #logger.irds.name = edu.usc.irds
43 | #logger.irds.level=DEBUG
44 | #logger.kythera.name = com.kytheralabs
45 | #logger.kythera.level = DEBUG
46 | #logger.spicule.name = uk.co.spicule
47 | #logger.spicule.level = DEBUG
--------------------------------------------------------------------------------
/conf/log4j2.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 |
19 | status = error
20 | name = PropertiesConfig
21 |
22 | filters = threshold
23 |
24 | filter.threshold.type = ThresholdFilter
25 | filter.threshold.level = INFO
26 |
27 | appenders = console
28 |
29 | appender.console.type = Console
30 | appender.console.name = STDOUT
31 | appender.console.layout.type = PatternLayout
32 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
33 |
34 | #rootLogger.level = INFO
35 | #rootLogger.appenderRefs = STDOUT
36 | #rootLogger.appenderRef.stdout.ref = STDOUT
37 | #logger.irds.name = edu.usc.irds
38 | #logger.irds.level=DEBUG
39 | #logger.kythera.name = com.kytheralabs
40 | #logger.kythera.level = DEBUG
41 | #logger.spicule.name = uk.co.spicule
42 | #logger.spicule.level = DEBUG
--------------------------------------------------------------------------------
/conf/regex-urlfilter.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | # The default url filter.
18 | # Better for whole-internet crawling.
19 |
20 | # Each non-comment, non-blank line contains a regular expression
21 | # prefixed by '+' or '-'. The first matching pattern in the file
22 | # determines whether a URL is included or ignored. If no pattern
23 | # matches, the URL is ignored.
24 |
25 | # skip file: ftp: and mailto: urls
26 | -^(file|ftp|mailto):
27 |
28 | # Default: skip image and other suffixes which produces large content
29 | # for a more extensive coverage use the urlfilter-suffix plugin
30 | -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4)$
31 |
32 | # skip URLs containing certain characters as probable queries, etc.
33 | #-[?*!@=]
34 |
35 | # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
36 | -.*(/[^/]+)/[^/]+\1/[^/]+\1/
37 |
38 | # accept any HTTP URL
39 | +^https?://
40 |
41 | # reject the rest
42 | -.
43 |
--------------------------------------------------------------------------------
/conf/solr-schema-map.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | ################## Solr Schema Map Properties #######################
17 |
18 | #overrides:
19 | # id: id
20 |
21 | typeSuffix:
22 | java.lang.String: _t
23 | java.lang.Integer: _i
24 | java.lang.Long: _l
25 | java.lang.Boolean: _b
26 | java.lang.Float: _f
27 | java.lang.Double: _d
28 | java.util.Date: _dt
29 |
30 | multiValSuffix: s
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/_rest_managed.json:
--------------------------------------------------------------------------------
1 | {"initArgs":{},"managedList":[]}
2 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/enumsConfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
20 |
21 | UNFETCHED
22 | FETCHING
23 | FETCHED
24 | IGNORED
25 | ERROR
26 |
27 |
28 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/lang/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # a couple of test stopwords to test that the words are really being
17 | # configured from this file:
18 | stopworda
19 | stopwordb
20 |
21 | # Standard english stop words taken from Lucene's StopAnalyzer
22 | a
23 | an
24 | and
25 | are
26 | as
27 | at
28 | be
29 | but
30 | by
31 | for
32 | if
33 | in
34 | into
35 | is
36 | it
37 | no
38 | not
39 | of
40 | on
41 | or
42 | such
43 | that
44 | the
45 | their
46 | then
47 | there
48 | these
49 | they
50 | this
51 | to
52 | was
53 | will
54 | with
55 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/protwords.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | # Use a protected word file to protect against the stemmer reducing two
15 | # unrelated words to the same base word.
16 |
17 | # Some non-words that normally won't be encountered,
18 | # just to test that they won't be stemmed.
19 | dontstems
20 | zwhacky
21 |
22 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/stopwords.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | #Standard english stop words taken from Lucene's StopAnalyzer
17 | a
18 | an
19 | and
20 | are
21 | as
22 | at
23 | be
24 | but
25 | by
26 | for
27 | if
28 | in
29 | into
30 | is
31 | it
32 | no
33 | not
34 | of
35 | on
36 | or
37 | s
38 | such
39 | t
40 | that
41 | the
42 | their
43 | then
44 | there
45 | these
46 | they
47 | this
48 | to
49 | was
50 | will
51 | with
52 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 | aaafoo => aaabar
16 | bbbfoo => bbbfoo bbbbar
17 | cccfoo => cccbar cccbaz
18 | fooaaa,baraaa,bazaaa
19 |
20 | # Some synonym groups specific to this example
21 | GB,gib,gigabyte,gigabytes
22 | MB,mib,megabyte,megabytes
23 | Television, Televisions, TV, TVs
24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | #after us won't split it into two words.
26 |
27 | # Synonym mappings can be used for spelling correction too
28 | pixima => pixma
29 |
30 |
--------------------------------------------------------------------------------
/conf/solr/crawldb/core.properties:
--------------------------------------------------------------------------------
1 | #Written by CorePropertiesLocator
2 | #Thu Oct 06 05:54:35 UTC 2016
3 | name=crawldb
4 |
--------------------------------------------------------------------------------
/conf/solr/solr.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
27 |
28 |
29 |
30 |
31 |
32 | ${host:}
33 | ${jetty.port:8983}
34 | ${hostContext:solr}
35 |
36 | ${genericCoreNodeNames:true}
37 |
38 | ${zkClientTimeout:30000}
39 | ${distribUpdateSoTimeout:600000}
40 | ${distribUpdateConnTimeout:60000}
41 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider}
42 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider}
43 |
44 |
45 |
46 |
48 | ${socketTimeout:600000}
49 | ${connTimeout:60000}
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/conf/solr/sparkler-jetty-context.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | /solr-webapp/sparkler
6 | false
7 |
8 |
--------------------------------------------------------------------------------
/conf/user-agents.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # User agents to be used
17 | # Each line contains an agent
18 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client1
19 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client2
20 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client3
21 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client4
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | _site
3 | _pages
4 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Sparkler Docs
2 |
3 | Read the docs at http://irds.usc.edu/sparkler
4 |
--------------------------------------------------------------------------------
/docs/Sparkler-Dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/Sparkler-Dashboard.png
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | # Site title and subtitle. This is used in _includes/header.html
2 | title: 'Sparkler'
3 | subtitle: 'Spark Crawler'
4 |
5 | # if you wish to integrate disqus on pages set your shortname here
6 | disqus_shortname: 'Sparkler'
7 |
8 | # if you use google analytics, add your tracking id here
9 | google_analytics_id: 'UA-77850818-1'
10 |
11 | # Enable/show navigation. There are three options:
12 | # 0 - always hide
13 | # 1 - always show
14 | # 2 - show only if posts are present
15 | navigation: 1
16 |
17 | # URL to source code, used in _includes/footer.html
18 | codeurl: 'https://github.com/USCDataScience/sparkler'
19 |
20 | # Default categories (in order) to appear in the navigation
21 | sections: [
22 | ['doc', 'Documentation'],
23 | ['tut', 'Tutorial'],
24 | ['ref', 'Reference'],
25 | ['dev', 'Developers'],
26 | ['post', 'Posts']
27 | ]
28 |
29 | # Keep as an empty string if served up at the root. If served up at a specific
30 | # path (e.g. on GitHub pages) leave off the trailing slash, e.g. /my-project
31 | baseurl: '/sparkler'
32 |
33 | # Dates are not included in permalinks
34 | permalink: none
35 |
36 | # Syntax highlighting
37 | highlighter: rouge
38 |
39 | # Since these are pages, it doesn't really matter
40 | future: true
41 |
42 | # Exclude non-site files
43 | exclude: ['bin', 'README.md', 'presentations', 'proposal', 'Sparkler-Dashboard.png']
44 |
45 | # Use the kramdown Markdown renderer
46 | markdown: kramdown
47 | redcarpet:
48 | extensions: [
49 | 'no_intra_emphasis',
50 | 'fenced_code_blocks',
51 | 'autolink',
52 | 'strikethrough',
53 | 'superscript',
54 | 'with_toc_data',
55 | 'tables',
56 | 'hardwrap'
57 | ]
58 |
--------------------------------------------------------------------------------
/docs/_includes/disqus.html:
--------------------------------------------------------------------------------
1 |
2 |
13 |
14 |
--------------------------------------------------------------------------------
/docs/_includes/footer.html:
--------------------------------------------------------------------------------
1 | Documentation for {{ site.title }}
2 |
--------------------------------------------------------------------------------
/docs/_includes/google_analytics.html:
--------------------------------------------------------------------------------
1 |
10 |
--------------------------------------------------------------------------------
/docs/_includes/header.html:
--------------------------------------------------------------------------------
1 | {{ site.title }}
2 | {% if site.subtitle %}{{ site.subtitle }}{% endif %}
3 |
4 |
--------------------------------------------------------------------------------
/docs/_includes/navigation.html:
--------------------------------------------------------------------------------
1 |
2 | - Home
3 | {% for section in site.sections %}
4 | {% assign attr = section[0] %}
5 | {% assign label = section[1] %}
6 |
7 | {% for page in site.categories[attr] %}
8 | {% if forloop.first %}
9 |
10 | {% endif %}
11 | - {{ page.title }}
12 | {% endfor %}
13 | {% endfor %}
14 |
16 |
17 |
--------------------------------------------------------------------------------
/docs/_layouts/page.html:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 |
5 |
10 |
11 | {{ content }}
12 |
--------------------------------------------------------------------------------
/docs/_posts/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/_posts/.gitkeep
--------------------------------------------------------------------------------
/docs/_posts/2017-12-26-contributing-to-docs.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Contributing to Docs"
4 | category: dev
5 | date: 2017-12-26 15:27:29
6 | ---
7 |
8 | Contributions are welcome all the way - big or small, including adding tutorials and how-to's!
9 |
10 | This page explains how to update the documentation.
11 |
12 | To add a new page to this website
13 |
14 | ```bash
15 | ruby bin/jekyll-page "Page Title" <category>
16 | ```
17 |
18 | `<category>` can be:
19 |
20 | - `doc` - Documentation
21 | - `tut` - Tutorial
22 | - `ref` - Reference
23 | - `dev` - Developers
24 | - `post` - Posts
25 |
26 | For example, if you want to write a tutorial about **Crawling images using Sparkler**
27 |
28 |
29 |
30 | ```bash
31 | ruby bin/jekyll-page "Crawling Images using Sparkler" tut
32 | ```
33 |
34 | Then edit the markdown file under `_posts/` directory.
35 |
36 | Then follow the standard github contribution guideline.
37 | If you have not already done so, fork this project from [https://github.com/USCDataScience/sparkler](https://github.com/USCDataScience/sparkler) to `https://github.com/<your-username>/sparkler`
38 |
39 | ```bash
40 | git remote add own git@github.com:<your-username>/sparkler.git
41 | git add docs/_posts/*
42 | git commit -m 'Added documentation for ___'
43 | git push own
44 | ```
45 |
46 | Then raise a pull request at [https://github.com/USCDataScience/sparkler](https://github.com/USCDataScience/sparkler) using the github web UI.
47 |
48 | Contact developers on [slack](/sparkler/#slack) if you have questions.
49 |
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/changelog.md
--------------------------------------------------------------------------------
/docs/css/main.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-weight: 400;
3 | text-shadow: 0 1px 1px rgba(255, 255, 255, 0.7);
4 | }
5 |
6 | pre, code, pre code {
7 | border: none;
8 | border-radius: 0;
9 | background-color: #f9f9f9;
10 | font-size: 0.85em;
11 | }
12 |
13 | .highlight {
14 | background-color: #f9f9f9;
15 | }
16 |
17 | pre {
18 | font-size: 1em;
19 | }
20 |
21 | code {
22 | color: inherit;
23 | }
24 |
25 | #header {
26 | border-bottom: 1px solid #eee;
27 | margin-bottom: 20px;
28 | }
29 |
30 | #header a:hover {
31 | text-decoration: none;
32 | }
33 |
34 | #footer {
35 | margin: 20px 0;
36 | font-size: 0.85em;
37 | color: #999;
38 | text-align: center;
39 | }
40 |
41 | #content > .page-header:first-child {
42 | margin-top: 0;
43 | }
44 |
45 | #content > .page-header:first-child h2 {
46 | margin-top: 0;
47 | }
48 |
49 |
50 | #navigation {
51 | font-size: 0.9em;
52 | }
53 |
54 | #navigation li a {
55 | padding-left: 10px;
56 | padding-right: 10px;
57 | }
58 |
59 | #navigation .nav-header {
60 | padding-left: 0;
61 | padding-right: 0;
62 | }
63 |
64 | body.rtl {
65 | direction: rtl;
66 | }
67 |
68 | body.rtl #header .brand {
69 | float: right;
70 | margin-left: 5px;
71 | }
72 | body.rtl .row-fluid [class*="span"] {
73 | float: right !important;
74 | margin-left: 0;
75 | margin-right: 2.564102564102564%;
76 | }
77 | body.rtl .row-fluid [class*="span"]:first-child {
78 | margin-right: 0;
79 | }
80 |
81 | body.rtl ul, body.rtl ol {
82 | margin: 0 25px 10px 0;
83 | }
84 |
85 | table {
86 | margin-bottom: 1rem;
87 | border: 1px solid #e5e5e5;
88 | border-collapse: collapse;
89 | }
90 |
91 | td, th {
92 | padding: .25rem .5rem;
93 | border: 1px solid #e5e5e5;
94 | }
95 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: "USC-IRDS Sparkler Documentation"
4 | ---
5 |
6 | **Sparkler** is a modern crawler powered by Apache Spark.
7 |
8 |
9 | ## Getting Help
10 |
11 | ### Using GitHub effectively
12 | If you have caught a bug, or need a feature, please create an issue on github: https://github.com/USCDataScience/sparkler/issues/new
13 |
14 |
15 | ### [Using Slack Channel](#slack)
16 | Two step process:
17 | 1. Join the Slack group using [this invitation link](https://join.slack.com/t/uscdatascience/shared_invite/enQtMjkwMTMzMDA2MTMxLWQwZjAyYTA3MDc4MjkyOTZlNzEyNzkxMGU3MzY5MWM0NDdmNWE1MmQxMWUwZjU0YWViMzBjNzg0YTM0NzE5ODg)
18 | 2. Navigate to the **Sparkler** channel located at [https://uscdatascience.slack.com/messages/sparkler](https://uscdatascience.slack.com/messages/sparkler)
19 |
20 | ### Using Mailing List
21 | Send your questions to the mailing list irds-l@usc.edu
22 |
23 | ## Developers and Contributors
24 |
25 | [Full list is here](https://github.com/USCDataScience/sparkler/graphs/contributors)
26 |
27 |
28 | ---
29 |
30 | ## Contributing
31 | - [Modifications to Source Code](/sparkler/dev/development-environment-setup.html#contributing-source)
32 | - [Updating documentation](/sparkler/dev/contributing-to-docs.html)
33 |
34 |
--------------------------------------------------------------------------------
/docs/presentations/Sparkler-for-SparkSummitEast17.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/docs/presentations/Sparkler-for-SparkSummitEast17.pdf
--------------------------------------------------------------------------------
/project/PluginDependencies.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import sbt._
19 |
20 | // Define global plugin dependencies here
21 | object PluginDependencies {}
22 |
23 | object FetcherChrome {
24 | object Selenium {
25 | private val group = "org.seleniumhq.selenium"
26 | private val version = "3.141.59"
27 | lazy val chromeDriver = group % "selenium-chrome-driver" % version
28 | lazy val java = group % "selenium-java" % version
29 | }
30 | lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
31 | lazy val seleniumscripter = "uk.co.spicule" % "seleniumscripter" % "1.7.9"
32 | lazy val magnesium_script = "uk.co.spicule" % "magnesium-script" % "0.2.0"
33 | }
34 |
35 | object FetcherHtmlUnit {
36 | lazy val htmlUnit = "net.sourceforge.htmlunit" % "htmlunit" % "2.26"
37 | }
38 |
39 | object FetcherJBrowser {
40 | lazy val jBrowser = "com.machinepublishers" % "jbrowserdriver" % "0.16.4"
41 | }
42 |
43 | object ScorerDdSvn {
44 | lazy val httpClient = "org.apache.httpcomponents" % "httpclient" % "4.3.6"
45 | }
46 |
47 | object Databricks {
48 | lazy val wrapper = "com.kytheralabs" % "webcrawlerwrapper_2.12" % "0.1-SNAPSHOT"
49 | }
50 |
51 | object UrlFilterSameHost {
52 | lazy val guava = "com.google.guava" % "guava" % "31.0.1-jre"
53 | }
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 1.5.0
2 |
--------------------------------------------------------------------------------
/project/metals.sbt:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT! This file is auto-generated.
2 |
3 | // This file enables sbt-bloop to create bloop config files.
4 |
5 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.10-8-8d1cbc4f")
6 |
7 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
18 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4")
19 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
20 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
21 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
22 | addSbtPlugin("com.gilcloud" % "sbt-gitlab" % "0.0.6")
--------------------------------------------------------------------------------
/project/project/metals.sbt:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT! This file is auto-generated.
2 |
3 | // This file enables sbt-bloop to create bloop config files.
4 |
5 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.10-8-8d1cbc4f")
6 |
7 |
--------------------------------------------------------------------------------
/release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Script : release.sh
18 | # Usage : ./release.sh
19 | # Description: Release Sparkler Silently - Create tag with version in version.sbt and bump it
20 |
21 | sbt clean package test && sbt releaseSilent
22 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/README.md:
--------------------------------------------------------------------------------
1 | # polar-domain-discovery
2 | Domain Discovery on Any Domain
3 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/compose/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | single-server-int:
4 | networks:
5 | - sparkler
6 | image: arangodb:3.4.6
7 | ports:
8 | - 8529:8529
9 | environment:
10 | - ARANGO_NO_AUTH=1
11 | volumes:
12 | - db:/var/lib/arangodb3
13 | sce-solr:
14 | image: uscdatascience/sparkler-solr:latest
15 | networks:
16 | - sparkler
17 | ports:
18 | - "8983:8983"
19 | volumes:
20 | - data:/opt/solr/server/solr/mycores
21 | command: ['/data/solr/bin/solr','start','-f']
22 |
23 | sce-splash:
24 | image: scrapinghub/splash
25 | networks:
26 | - sparkler
27 | ports:
28 | - 8050:8050
29 |
30 | sce-ui:
31 | image: uscdatascience/sparkler-ui:latest
32 | networks:
33 | - sparkler
34 | ports:
35 | - "8080:80"
36 | volumes:
37 | - /var/run/docker.sock:/var/run/docker.sock
38 | - models:/models
39 | volumes:
40 | data:
41 | db:
42 | models:
43 |
44 | networks:
45 | sparkler:
46 | external:
47 | name: sparkler
48 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/deployment/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3-buster
2 | RUN apt update && apt install -y docker.io
3 |
4 | WORKDIR /projects/sce-domain-discovery/webui
5 |
6 | COPY webui/requirements.txt /projects/sce-domain-discovery/webui/
7 |
8 | RUN pip install -r requirements.txt && mkdir /models && mkdir /images
9 |
10 | COPY . /projects/sce-domain-discovery/
11 |
12 |
13 | CMD ["python", "waitress_server.py"]
14 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/evaluation/phase1/readme.md:
--------------------------------------------------------------------------------
1 | **Seed Exploration URLs**
2 |
3 | https://docs.google.com/spreadsheets/d/1rbSE1v8Cu9_NQYpKtquvgdLUzJHxkWoOdR6rxs4u7Gg/edit#gid=0
4 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3-buster
2 | RUN apt update && apt install -y docker.io
3 |
4 | WORKDIR /projects/sce-domain-discovery/webui
5 |
6 | COPY webui/requirements.txt /projects/sce-domain-discovery/webui/
7 |
8 | RUN pip install -r requirements.txt && mkdir /models && mkdir /images
9 |
10 | COPY . /projects/sce-domain-discovery/
11 |
12 |
13 | CMD ["python", "waitress_server.py"]
14 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/app/__init__.py:
--------------------------------------------------------------------------------
1 | """Bootstrap the API"""
2 |
3 | import logging
4 |
5 | from flask import Flask
6 | # Import a module / component using its blueprint handler variable
7 | from app.controller import MOD_APP as app_module
8 | # Import flask and template operators
9 | from app.apis import API_OBJ
10 |
11 | # Define the WSGI application object
12 | APP = Flask(__name__,
13 | static_url_path='',
14 | static_folder='static')
15 |
16 | logging.basicConfig(level=logging.DEBUG)
17 |
18 | # Configurations
19 | APP.config.from_object('config')
20 |
21 |
22 | # Register blueprint(s)
23 | APP.register_blueprint(app_module)
24 |
25 |
26 | # Initialize flask-restplus
27 | API_OBJ.init_app(APP)
28 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/app/apis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Setup the API and add the required namespaces
3 | """
4 |
5 | from flask_restplus import Api
6 |
7 | from app.apis.ns_search import API as search_api
8 | from app.apis.ns_classify import API as classify_api
9 |
10 | API_OBJ = Api(title='Seed Generation', version='1.0',
11 | description='Tool to generate seeds for Domain Discovery', doc='/doc')
12 |
13 | API_OBJ.add_namespace(search_api)
14 | API_OBJ.add_namespace(classify_api)
15 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/app/apis/ns_classify.py:
--------------------------------------------------------------------------------
1 | """
2 | Classify Endpoints for the REST API.
3 | """
4 | import json
5 | import os
6 |
7 | from flask_restplus import Namespace, Resource
8 | from flask import request
9 | from app import classifier
10 |
11 | PFX = os.getenv('API_PFX', '')
12 |
13 |
14 | API = Namespace('classify', description='Interact with the ML model', path=PFX+'/classify')
15 |
16 |
@API.route('/predict', methods=['GET', 'POST'])
class Predict(Resource):
    """Predict a result.

    GET reads its input from the query string; POST reads it from a JSON body.
    Both return one of the human-readable labels in ``CLASSES``.
    """

    # Maps the value returned by classifier.predict to the label sent back to
    # the client. -1 doubles as the answer when there is nothing to classify.
    # NOTE(review): assumes classifier.predict only returns these keys - confirm.
    CLASSES = {
        -1: 'Model doesn\'t exist',
        0: 'Not Relevant',
        1: 'Relevant',
        2: 'Highly Relevant'
    }

    @API.doc('predict')
    @staticmethod
    def get(content=None, model=None):
        """
        Predict using ML model.

        The '/predict' route carries no URL variables, so both parameters
        default to None and are normally filled from the query string
        (``?content=...&model=...``). Explicit arguments are used only when the
        corresponding query parameter is absent.

        :param content: text to classify
        :param model: model identifier handed to classifier.predict
        :return: label from CLASSES; the -1 label when no content was supplied
        """
        args = request.args
        content = args.get('content', content)
        model = args.get('model', model)
        if content:
            result = classifier.predict(model, content)
            return Predict.CLASSES[result]
        print('NO CONTENT FOUND')
        return Predict.CLASSES[-1]

    @API.doc('predict')
    @staticmethod
    def post():
        """
        Predict using ML model.

        Expects a JSON body of the form
        ``{"score": [{"content": ..., "model": ...}]}``; only the first entry of
        ``score`` is read.

        :return: label from CLASSES; the -1 label when the body is empty or the
                 content is null
        """
        data = request.data
        # Check for an empty body before parsing: json.loads('') would raise
        # instead of yielding the "no result" label.
        if len(data) == 0:
            return Predict.CLASSES[-1]

        loaded_data = json.loads(data.decode('utf-8', 'ignore'))
        entry = loaded_data['score'][0]
        content = entry['content']
        if content is None:
            return Predict.CLASSES[-1]

        model = entry['model']
        result = classifier.predict(model, content)
        return Predict.CLASSES[result]
68 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/app/apis/ns_search.py:
--------------------------------------------------------------------------------
1 | """
2 | Search Endpoints for the REST API
3 | """
4 | import json
5 | import os
6 |
7 | from flask_restplus import Namespace, Resource, cors
8 | from flask import current_app as a
9 | from pyArango.theExceptions import DocumentNotFoundError
10 | from werkzeug.exceptions import BadRequest
11 | from app import search
12 |
13 | PFX = os.getenv('API_PFX', '')
14 |
15 | API = Namespace('search', description='Query Duck Duck Go for results', path=PFX+'/search')
16 |
17 |
# NOTE(review): the URL variables were missing from this route ('//'), which
# leaves get() without its required arguments. They are restored in the order
# of the method signature - confirm the order against existing clients.
@API.route('/<model>/<query>')
@API.param('query', 'Query string to search')
class Search(Resource):
    """ Search a resource """
    @classmethod
    @API.doc('search')
    @cors.crossdomain(origin='*')
    def get(cls, model, query):
        """
        Search Duck Duck Go
        :param model: model identifier passed through to search.query_and_fetch
        :param query: query string to search for
        :return: JSON-encoded result of search.query_and_fetch (top_n=12)
        :raises BadRequest: when the lookup raises DocumentNotFoundError
        """
        a.logger.debug('Search Called!')
        try:
            url_details = search.query_and_fetch(query, model, top_n=12)
        except DocumentNotFoundError as exception:
            # Log through the app logger rather than stdout and keep the
            # original error attached as the cause of the 400 response.
            a.logger.warning('Model lookup failed: %s', exception)
            raise BadRequest('Model Not Found') from exception

        return json.dumps(url_details)
40 |
41 |
# NOTE(review): the URL variables were missing from this route ('///'), which
# leaves get() without its required arguments. They are restored in the order
# of the method signature - confirm the order against existing clients.
@API.route('/<model>/<query>/<page>')
@API.param('query', 'Query string to search')
@API.param('page', 'Results Page')
class SearchPaginated(Resource):
    """Execute a paginated search"""
    @classmethod
    @API.doc('searchpaginated')
    @cors.crossdomain(origin='*')
    def get(cls, model, query, page):
        """
        Search Duck Duck Go
        :param model: model identifier passed through to search.query_and_fetch
        :param query: query string to search for
        :param page: results page; must be convertible to int
        :return: JSON-encoded result of search.query_and_fetch (top_n=12)
        :raises BadRequest: when page is not an integer, or when the lookup
                            raises DocumentNotFoundError
        """
        a.logger.debug('Paged Search Called!')
        # Reject a malformed page up front so the client gets a 400 instead of
        # an unhandled ValueError.
        try:
            page_number = int(page)
        except (TypeError, ValueError) as exception:
            raise BadRequest('Page must be an integer') from exception

        try:
            url_details = search.query_and_fetch(query, model, page=page_number, top_n=12)
        except DocumentNotFoundError as exception:
            a.logger.warning('Model lookup failed: %s', exception)
            raise BadRequest('Model Not Found') from exception

        return json.dumps(url_details)
66 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/app/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-sce/webui/app/models/__init__.py
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/config.py:
--------------------------------------------------------------------------------
1 | """Flask config options"""
2 | import os
3 | # Statement for enabling the development environment
4 | DEBUG = True
5 |
6 | # Define the application directory
7 | BASE_DIR = os.path.abspath(os.path.dirname(__file__))
8 |
9 | # Application threads. A common general assumption is
10 | # using 2 per available processor core - one to handle
11 | # incoming requests and the other to perform background
12 | # operations.
13 | THREADS_PER_PAGE = 24
14 |
15 | # Enable protection against *Cross-site Request Forgery (CSRF)*
16 | CSRF_ENABLED = True
17 |
18 | # Use a secure, unique and absolutely secret key for
19 | # signing the data.
20 | CSRF_SESSION_KEY = 'secretDD'
21 |
22 | # Secret key for signing cookies
23 | SECRET_KEY = 'secretDD'
24 |
25 | # Path to the uploads
26 | UPLOAD_FOLDER = '..'
27 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/keywords.txt:
--------------------------------------------------------------------------------
1 | This is a test
2 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/requirements.txt:
--------------------------------------------------------------------------------
1 | aniso8601==8.0.0
2 | astroid==2.3.3
3 | attrs==19.3.0
4 | beautifulsoup4==4.9.0
5 | bs4==0.0.1
6 | certifi==2020.4.5.1
7 | chardet==3.0.4
8 | click==7.1.1
9 | DateTime==4.3
10 | Flask==1.1.2
11 | Flask-Cors==3.0.9
12 | flask-restplus==0.13.0
13 | future==0.18.2
14 | idna==2.9
15 | importlib-metadata==1.6.0
16 | isort==4.3.21
17 | itsdangerous==1.1.0
18 | Jinja2==2.11.3
19 | joblib==0.14.1
20 | jsonschema==3.2.0
21 | lazy-object-proxy==1.4.3
22 | MarkupSafe==1.1.1
23 | mccabe==0.6.1
24 | numpy==1.21.0
25 | pyArango==1.3.4
26 | pylint==2.4.4
27 | pyrsistent==0.16.0
28 | pytz==2019.3
29 | PyYAML==5.4
30 | requests==2.23.0
31 | scikit-learn==0.22.2.post1
32 | scipy==1.4.1
33 | six==1.14.0
34 | sklearn==0.0
35 | soupsieve==2.0
36 | typed-ast==1.4.1
37 | urllib3==1.26.5
38 | waitress==1.4.3
39 | Werkzeug==0.16.1
40 | wrapt==1.11.2
41 | zipp==3.1.0
42 | zope.interface==5.1.0
43 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/run.py:
--------------------------------------------------------------------------------
1 | """Run the flask server"""
2 | from app import APP
3 |
4 | # Run Server
5 | if __name__ == '__main__':
6 | APP.run(host='0.0.0.0', port=5000, debug=True, threaded=True)
7 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/run.wsgi:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import logging
4 | import sys
5 | logging.basicConfig(stream=sys.stderr)
6 | sys.path.insert(0, '/sce/webui')
7 | from run import APP as application
8 | application.secret_key = 'anything you wish'
9 |
--------------------------------------------------------------------------------
/retired/sparkler-sce/webui/waitress_server.py:
--------------------------------------------------------------------------------
1 | """
2 | Run Flask App in production context
3 | """
4 |
5 | from waitress import serve
6 | import run
7 |
8 | serve(run.APP, host='0.0.0.0', port=5000)
9 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM uscdatascience/sce-domain-discovery:latest
2 |
3 | RUN echo 'hello'
4 |
5 | FROM ubuntu:bionic
6 |
7 | WORKDIR /usr/src/app
8 |
9 | RUN apt update && apt-get install -y sudo python3 python3-dev python3-pip docker.io apache2 libapache2-mod-wsgi-py3 curl && curl -sL https://deb.nodesource.com/setup_10.x | bash - && apt update && apt install -y nodejs
10 |
11 | #RUN adduser --disabled-password --gecos '' docker
12 | RUN adduser www-data sudo
13 |
14 | RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
15 | COPY package*.json ./
16 | COPY scripts/run.sh /
17 | RUN npm ci --only=production
18 | COPY . .
19 | RUN npm run build && apt update && chmod +x /run.sh && a2enmod proxy && a2enmod proxy_http && mkdir /var/www/html/explorer && cp -rf build/* /var/www/html/explorer/
20 | COPY scripts/000-default.conf /etc/apache2/sites-available/
21 |
22 | EXPOSE 8080
23 | EXPOSE 80
24 |
25 | COPY --from=0 /projects/sce-domain-discovery /sce
26 |
27 | RUN cd /sce/webui && pip3 install -r requirements.txt && mkdir /images && mkdir /models && chown www-data:www-data /images && chown www-data:www-data /models && gpasswd -a www-data docker
28 | RUN ln -sf /dev/stdout /var/log/apache2/access.log \
29 | && ln -sf /dev/stderr /var/log/apache2/error.log
30 | CMD [ "/run.sh" ]
31 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/database.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "database.arangodb.com/v1alpha"
2 | kind: "ArangoDeployment"
3 | metadata:
4 | name: "single-server"
5 | spec:
6 | mode: Single
7 |
8 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "sce-ui",
3 | "version": "0.1.0",
4 | "private": true,
5 | "homepage": "/explorer/",
6 | "dependencies": {
7 | "@blueprintjs/core": "^3.14.1",
8 | "arangojs": "5",
9 | "axios": "^0.21.1",
10 | "dotenv": "^7.0.0",
11 | "js-file-download": "^0.4.10",
12 | "location-origin": "^1.1.4",
13 | "nano": "^8.0.1",
14 | "react": "^16.8.5",
15 | "react-dom": "^16.8.5",
16 | "react-grid-system": "^4.4.3",
17 | "react-iframe": "^1.7.11",
18 | "react-redux": "^7.0.1",
19 | "react-router-dom": "^5.0.0",
20 | "react-scripts": "2.1.8",
21 | "redux": "^4.0.1",
22 | "redux-thunk": "^2.3.0"
23 | },
24 | "scripts": {
25 | "start": "react-scripts start",
26 | "build": "react-scripts build",
27 | "test": "react-scripts test",
28 | "eject": "react-scripts eject"
29 | },
30 | "eslintConfig": {
31 | "extends": "react-app"
32 | },
33 | "browserslist": [
34 | ">0.2%",
35 | "not dead",
36 | "not ie <= 11",
37 | "not op_mini all"
38 | ]
39 | }
40 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-ui/public/favicon.ico
--------------------------------------------------------------------------------
/retired/sparkler-ui/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
10 |
11 |
15 |
16 |
25 | React App
26 |
27 |
28 |
29 |
30 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "React App",
3 | "name": "Create React App Sample",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | }
10 | ],
11 | "start_url": ".",
12 | "display": "standalone",
13 | "theme_color": "#000000",
14 | "background_color": "#ffffff"
15 | }
16 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/scripts/000-default.conf:
--------------------------------------------------------------------------------
1 |
2 | # The ServerName directive sets the request scheme, hostname and port that
3 | # the server uses to identify itself. This is used when creating
4 | # redirection URLs. In the context of virtual hosts, the ServerName
5 | # specifies what hostname must appear in the request's Host: header to
6 | # match this virtual host. For the default virtual host (this file) this
7 | # value is not decisive as it is used as a last resort host regardless.
8 | # However, you must set it for any further virtual host explicitly.
9 | #ServerName www.example.com
10 |
11 | ServerAdmin webmaster@localhost
12 | DocumentRoot /var/www/html
13 |
14 | # Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
15 | # error, crit, alert, emerg.
16 | # It is also possible to configure the loglevel for particular
17 | # modules, e.g.
18 | #LogLevel info ssl:warn
19 |
20 | ErrorLog ${APACHE_LOG_DIR}/error.log
21 | CustomLog ${APACHE_LOG_DIR}/access.log combined
22 |
23 | ProxyPreserveHost On
24 | ProxyPass http://sce-solr:8983/banana
25 | ProxyPassReverse http://sce-solr:8983/banana
26 | Order allow,deny
27 | Allow from all
28 |
29 |
30 | ProxyPreserveHost On
31 | ProxyPass http://sce-solr:8983/solr
32 | ProxyPassReverse http://sce-solr:8983/solr
33 | Order allow,deny
34 | Allow from all
35 |
36 |
37 | WSGIScriptAlias /explorer-api /sce/webui/run.wsgi
38 | WSGIDaemonProcess hello user=www-data group=www-data threads=5
39 | WSGIScriptReloading On
40 |
41 | WSGIProcessGroup hello
42 | WSGIApplicationGroup %{GLOBAL}
43 | Options Indexes FollowSymLinks MultiViews
44 | AllowOverride None
45 | Order allow,deny
46 | Allow from all
47 | Require all granted
48 |
49 | # For most configuration files from conf-available/, which are
50 | # enabled or disabled at a global level, it is possible to
51 | # include a line for only one particular virtual host. For example the
52 | # following line enables the CGI configuration for this host only
53 | # after it has been globally disabled with "a2disconf".
54 | #Include conf-available/serve-cgi-bin.conf
55 |
56 |
57 | # vim: syntax=apache ts=4 sw=4 sts=4 sr noet
58 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | exec apachectl -D FOREGROUND
3 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/App.css:
--------------------------------------------------------------------------------
1 | @import "~normalize.css";
2 | @import "~@blueprintjs/core/lib/css/blueprint.css";
3 | @import "~@blueprintjs/icons/lib/css/blueprint-icons.css";
4 |
5 | .generatemodel > .bp3-input-group {
6 | display: inline-block !important;
7 | }
8 | .App {
9 | text-align: center;
10 | }
11 |
12 | .App-logo {
13 | animation: App-logo-spin infinite 20s linear;
14 | height: 40vmin;
15 | pointer-events: none;
16 | }
17 |
18 | .App-header {
19 | background-color: #282c34;
20 | min-height: 100vh;
21 | display: flex;
22 | flex-direction: column;
23 | align-items: center;
24 | justify-content: center;
25 | font-size: calc(10px + 2vmin);
26 | color: white;
27 | }
28 |
29 | .App-link {
30 | color: #61dafb;
31 | }
32 |
33 | @keyframes App-logo-spin {
34 | from {
35 | transform: rotate(0deg);
36 | }
37 | to {
38 | transform: rotate(360deg);
39 | }
40 | }
41 |
42 | .btn-circle {
43 |
44 | width: 30px;
45 | height: 30px;
46 | padding: 6px 0;
47 | border-radius: 15px;
48 | text-align: center;
49 | font-size: 12px;
50 | line-height: 1.428571429;
51 |
52 | }
53 |
54 | .green{
55 | background-color: #0d8050;
56 | color: whitesmoke;
57 | }
58 |
59 | .amber{
60 | background-color: #bf7326;
61 | color: whitesmoke;
62 | }
63 |
64 | .red{
65 | background-color: #c23030;
66 | color: whitesmoke;
67 | }
68 |
69 | .btn-padding{
70 | margin: 5px;
71 | }
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/App.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import './App.css';
3 | import Main from './views/Main'
4 | import Navbar from './components/Navbar'
5 |
6 | class App extends Component {
7 |
8 | render() {
9 | return (
10 |
11 |
12 |
13 |
14 | );
15 | }
16 | }
17 |
18 | export default App;
19 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/App.test.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom';
3 | import App from './App';
4 |
5 | it('renders without crashing', () => {
6 | const div = document.createElement('div');
7 | ReactDOM.render(, div);
8 | ReactDOM.unmountComponentAtNode(div);
9 | });
10 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/actions/test.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/USCDataScience/sparkler/547b01087ef00ed6c39e527566712a1a85b64499/retired/sparkler-ui/src/actions/test.js
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/Banana.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react'
2 | import Iframe from "react-iframe";
3 |
4 | class Banana extends Component {
5 |
6 | render() {
7 | return (
8 |
9 |
10 |
11 | );
12 | }
13 | }
14 |
15 | export default Banana
16 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/RouterButton.js:
--------------------------------------------------------------------------------
1 | import {Route} from "react-router-dom";
2 | import React from "react";
3 | import {AnchorButton} from "@blueprintjs/core";
4 |
5 | export const RouterButton = (props) => (
6 | (
10 | props.history.push(props.to)}>
12 | {props.label}
13 |
14 | )}
15 | />
16 | );
17 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/Solr.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react'
2 | import Iframe from "react-iframe";
3 |
4 | class Banana extends Component {
5 |
6 | render() {
7 | return (
8 |
9 |
10 |
11 | );
12 | }
13 | }
14 |
15 | export default Banana
16 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/VisitDashboard.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 |
3 |
4 | class VisitDashboard extends Component {
5 |
6 | render() {
7 | return (
8 |
9 | )
10 | }
11 | }
12 |
13 | export default VisitDashboard
14 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/dialogs/ModelDialog.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import {Button, Classes, Dialog, Tab, Tabs} from "@blueprintjs/core";
3 | import ListModels from "../panels/ListModels";
4 | import NewModel from "../panels/NewModel";
5 | import {connect} from "react-redux";
6 |
7 | class ModelDialog extends Component {
8 |
9 | constructor(props) {
10 | super(props);
11 | this.state = {
12 | value: ''
13 | }
14 | this.handleChange = this.handleChange.bind(this);
15 |
16 | }
17 |
18 | componentDidUpdate(oldProps) {
19 | const newProps = this.props
20 | if (oldProps.current_model !== newProps.current_model) {
21 | this.handleClose()
22 | }
23 | }
24 |
25 |
26 | handleClose = () => this.setState({isOpen: false});
27 | handleOpen = () => this.setState({isOpen: true});
28 |
29 | handleChange(event) {
30 | this.setState({value: event.target.value});
31 | }
32 |
33 | render() {
34 | return (
35 |
55 |
56 | )
57 | }
58 |
59 | }
60 |
61 | const mapStateToProps = state => {
62 | return {
63 | current_model: state.modelreducer.current_model
64 | }
65 | }
66 |
67 | export default connect(mapStateToProps, null, null, {forwardRef: true})(ModelDialog)
68 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/panels/ListModels.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import {Card} from "@blueprintjs/core";
3 | import {enableModel, fetchAllModels} from "../../actions";
4 | import {connect} from "react-redux";
5 |
6 | class ListModels extends Component {
7 |
8 | componentWillMount() {
9 | this.props.fetchAllModels()
10 | }
11 |
12 | handleSelect = function (name) {
13 | this.props.enableModel(name)
14 | }
15 |
16 | render() {
17 | if (this.props.models) {
18 | return (
19 |
20 | {
21 | this.props.models.map((item, key) => {
22 | return
23 |
24 |
25 | })}
26 |
27 | )
28 | } else {
29 | return (
30 | No Models Available Yet
31 | )
32 | }
33 |
34 | }
35 | }
36 |
37 | const mapDispatchToProps = dispatch => ({
38 | fetchAllModels: () => dispatch(fetchAllModels()),
39 | enableModel: (name) => dispatch(enableModel(name))
40 |
41 | })
42 |
43 | const mapStateToProps = state => {
44 | return {
45 | models: state.modelreducer.models
46 | }
47 | }
48 |
49 | export default connect(mapStateToProps, mapDispatchToProps)(ListModels)
50 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/panels/NewModel.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import {Button, Card, InputGroup} from "@blueprintjs/core";
3 | import {FormGroup} from "@blueprintjs/core/lib/cjs";
4 | import {INTENT_PRIMARY} from "@blueprintjs/core/lib/cjs/common/classes";
5 | import {createNewModel} from "../../actions";
6 | import {connect} from "react-redux";
7 |
8 | class NewModel extends Component {
9 |
10 |
11 | constructor(props) {
12 | super(props);
13 | this.state = {};
14 | this.state['intent'] = INTENT_PRIMARY;
15 | this.state['model'] = '';
16 |
17 | this.handleChange = this.handleChange.bind(this);
18 | this.handleSubmit = this.handleSubmit.bind(this);
19 | }
20 |
21 | handleSubmit = function (e) {
22 |
23 | this.props.createNewModel(this.state.model)
24 |
25 |
26 | };
27 |
28 | handleChange = function (e) {
29 | this.setState({model: e.target.value})
30 | };
31 |
32 | render() {
33 | return
34 |
38 |
40 |
41 |
42 |
43 |
44 | }
45 | }
46 |
47 | const mapDispatchToProps = dispatch => ({
48 | createNewModel: (s) => dispatch(createNewModel(s))
49 | })
50 |
51 | export default connect(null, mapDispatchToProps)(NewModel)
52 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/sidebar/CrawlConfig.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import {AnchorButton, H5} from "@blueprintjs/core";
3 | import SeedURLDialog from "../dialogs/SeedURLDialog";
4 | import {connect} from "react-redux";
5 | import CrawlConfigDialog from "../dialogs/CrawlConfigDialog";
6 |
7 |
8 | class CrawlConfig extends Component {
9 |
10 |
11 | constructor(props) {
12 | super(props);
13 | this.state = {
14 | open: false
15 | };
16 | this.modalElement = React.createRef();
17 | }
18 |
19 | handleOpen = () => this.modalElement.current.handleOpen();
20 |
21 | render() {
22 | return (
23 |
24 |
Crawler Configuration
25 |
27 |
28 |
29 | )
30 | }
31 | }
32 |
33 | const mapStateToProps = state => {
34 | return {
35 | current_model: state.modelreducer.current_model
36 | }
37 | }
38 |
39 | export default connect(mapStateToProps)(CrawlConfig)
40 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/sidebar/CreateSeed.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from 'react';
2 | import {AnchorButton, H5} from "@blueprintjs/core";
3 | import SeedURLDialog from "../dialogs/SeedURLDialog";
4 | import {connect} from "react-redux";
5 |
6 |
7 | class CreateSeed extends Component {
8 |
9 |
10 | constructor(props) {
11 | super(props);
12 | this.state = {
13 | open: false
14 | };
15 | this.modalElement = React.createRef();
16 | }
17 |
18 | handleOpen = () => this.modalElement.current.handleOpen();
19 |
20 | render() {
21 | return (
22 |
23 |
Create Seed File
24 |
26 |
27 |
28 | )
29 | }
30 | }
31 |
32 | const mapStateToProps = state => {
33 | return {
34 | current_model: state.modelreducer.current_model
35 | }
36 | }
37 |
38 | export default connect(mapStateToProps)(CreateSeed)
39 |
--------------------------------------------------------------------------------
/retired/sparkler-ui/src/components/sidebar/ExportData.js:
--------------------------------------------------------------------------------
1 | import React, {Component} from "react";
2 | import {Button} from "@blueprintjs/core";
3 | import {connect} from "react-redux";
4 | import {crawlStatus, exportData, killCrawl, startCrawl} from "../../actions";
5 |
6 | class ExportData extends Component {
7 |
8 | constructor(props) {
9 | super(props);
10 | this.handleClick2 = this.handleClick2.bind(this);
11 |
12 | }
13 |
14 | handleClick2 = function () {
15 | this.props.export(this.props.current_model)
16 |
17 | }
18 |
19 | render() {
20 | return (
21 |