├── .gitattributes
├── .github
│   └── tokenmill-logo.svg
├── .gitignore
├── .gitlab-ci.yml
├── Dockerfile.base
├── Dockerfile.crawler
├── Dockerfile.es
├── Dockerfile.ui
├── LICENSE
├── Makefile
├── README.md
├── administration-ui
│   ├── conf
│   │   ├── development.properties
│   │   └── docker-compose.properties
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── adminui
│       │   │                   ├── Application.java
│       │   │                   ├── CrawlerAdminUI.java
│       │   │                   ├── HttpSourceTestsCache.java
│       │   │                   ├── utils
│       │   │                   │   ├── CSVUtils.java
│       │   │                   │   ├── GridUtils.java
│       │   │                   │   ├── HttpSourceCSVUtils.java
│       │   │                   │   └── HttpSourceTestCSVUtils.java
│       │   │                   └── view
│       │   │                       ├── BaseView.java
│       │   │                       ├── HttpSourceForm.java
│       │   │                       ├── HttpSourceStatsWindow.java
│       │   │                       ├── HttpSourceTestWindow.java
│       │   │                       ├── HttpSourcesView.java
│       │   │                       ├── ImportExportView.java
│       │   │                       ├── imports
│       │   │                       │   ├── HttpSourceImportExport.java
│       │   │                       │   ├── HttpSourceTestImportExport.java
│       │   │                       │   └── NamedQueryImportExport.java
│       │   │                       ├── namedquery
│       │   │                       │   ├── NamedQueriesView.java
│       │   │                       │   ├── NamedQueryFormWindow.java
│       │   │                       │   └── NamedQueryResultsPanel.java
│       │   │                       ├── pageanalysis
│       │   │                       │   └── PageAnalysisView.java
│       │   │                       └── sourcetest
│       │   │                           ├── HttpSourceAllTestsWindow.java
│       │   │                           ├── HttpSourceTestFormWindow.java
│       │   │                           ├── HttpSourceTestsView.java
│       │   │                           └── TestResultsPanel.java
│       │   ├── resources
│       │   │   ├── log4j.properties
│       │   │   └── log4j2.properties
│       │   └── webapp
│       │       └── VAADIN
│       │           └── themes
│       │               └── crawleradmintheme
│       │                   ├── addons.scss
│       │                   ├── crawleradmintheme.scss
│       │                   ├── styles.css
│       │                   └── styles.scss
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── adminui
│           │                   └── utils
│           │                       ├── HttpSourceTestCSVUtilsTest.java
│           │                       └── HttpSourcesCSVUtilsTest.java
│           └── resources
│               └── www.tokenmill.lt.html
├── analysis-ui
│   ├── conf
│   │   └── development.properties
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── analysisui
│           │                   ├── AnalysisUI.java
│           │                   ├── Application.java
│           │                   ├── search
│           │                   │   └── ResultPanel.java
│           │                   └── view
│           │                       ├── BaseView.java
│           │                       ├── ContextCloudView.java
│           │                       └── SearchView.java
│           ├── resources
│           │   └── log4j.properties
│           └── webapp
│               └── VAADIN
│                   └── themes
│                       └── analysistheme
│                           ├── addons.scss
│                           ├── analysistheme.scss
│                           ├── styles.css
│                           └── styles.scss
├── bin
│   ├── create-es-index.sh
│   ├── create-es-indices.sh
│   ├── deploy-crawler.sh
│   ├── run-administration-ui.sh
│   ├── run-analysis-ui.sh
│   └── run-crawler.sh
├── crawler
│   ├── conf
│   │   ├── docker-compose.yaml
│   │   └── local.yaml
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── crawler
│       │   │                   ├── CrawlerConstants.java
│       │   │                   ├── CrawlerTopology.java
│       │   │                   ├── DefaultServiceProvider.java
│       │   │                   ├── ServiceProvider.java
│       │   │                   ├── bolt
│       │   │                   │   ├── ArticleIndexerBolt.java
│       │   │                   │   ├── LinkExtractorBolt.java
│       │   │                   │   └── StatusUpdaterBolt.java
│       │   │                   ├── spout
│       │   │                   │   ├── HttpSourceConfiguration.java
│       │   │                   │   └── UrlGeneratorSpout.java
│       │   │                   └── utils
│       │   │                       ├── PrioritizedSource.java
│       │   │                       ├── UrlFilterUtils.java
│       │   │                       └── UrlFiltersCache.java
│       │   └── resources
│       │       ├── urlfilters.json
│       │       └── urlfilters.txt
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── crawler
│                               └── spout
│                                   ├── UrlFilterUtilsTest.java
│                                   └── UrlGeneratorSpoutTest.java
├── data-model
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── data
│       │                       ├── DataUtils.java
│       │                       ├── HighlightedSearchResult.java
│       │                       ├── HtmlAnalysisResult.java
│       │                       ├── HttpArticle.java
│       │                       ├── HttpArticleParseResult.java
│       │                       ├── HttpSource.java
│       │                       ├── HttpSourceTest.java
│       │                       ├── HttpUrl.java
│       │                       ├── NamedQuery.java
│       │                       └── PageableList.java
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── data
│                               └── DataUtilsTest.java
├── docker-compose.dev.yml
├── docker-compose.run.yml
├── elasticsearch
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── es
│       │   │                   ├── BaseElasticOps.java
│       │   │                   ├── ElasticConnection.java
│       │   │                   ├── ElasticConstants.java
│       │   │                   ├── EsDataParser.java
│       │   │                   ├── EsDocumentOperations.java
│       │   │                   ├── EsHttpSourceOperations.java
│       │   │                   ├── EsHttpSourceTestOperations.java
│       │   │                   ├── EsHttpSourcesCache.java
│       │   │                   ├── EsHttpUrlOperations.java
│       │   │                   ├── EsNamedQueryOperations.java
│       │   │                   ├── Utils.java
│       │   │                   └── model
│       │   │                       └── DateHistogramValue.java
│       │   └── resources
│       │       └── indices
│       │           ├── document.json
│       │           ├── http_source.json
│       │           ├── http_source_test.json
│       │           ├── query.json
│       │           └── url.json
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── es
│           │                   ├── ElasticConnectionTest.java
│           │                   ├── ElasticsearchTestServer.java
│           │                   ├── EsDocumentOperationsTest.java
│           │                   ├── EsHttpSourceOperationsTest.java
│           │                   ├── EsHttpSourceTestOperationsTest.java
│           │                   ├── EsHttpUrlOperationsTestInt.java
│           │                   ├── IndexManager.java
│           │                   └── TestUtils.java
│           └── resources
│               ├── log4j.properties
│               ├── log4j2.properties
│               └── www.tokenmill.lt.html
├── page-analyzer
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── pageanalyzer
│       │                       └── PageAnalyzer.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── pageanalyzer
│           │                   └── PageAnalyzerTest.java
│           └── resources
│               └── bloomberg.com.html
├── parser
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── parser
│       │                       ├── ArticleExtractor.java
│       │                       ├── DateParser.java
│       │                       ├── PageAnalyzer.java
│       │                       ├── TitleParser.java
│       │                       ├── data
│       │                       │   ├── MatchedDate.java
│       │                       │   └── MatchedString.java
│       │                       ├── urls
│       │                       │   ├── UrlExtractor.java
│       │                       │   └── UrlFilters.java
│       │                       └── utils
│       │                           ├── HttpSourceTester.java
│       │                           ├── JsonLdParser.java
│       │                           ├── QueryParser.java
│       │                           ├── TextFilters.java
│       │                           └── TextProfileSignature.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── parser
│           │                   ├── AljazeeraExtractorTest.java
│           │                   ├── BaseArticleExtractorTest.java
│           │                   ├── BloombergExtractorTest.java
│           │                   ├── CyberscoopExtractorTest.java
│           │                   ├── DateParserTest.java
│           │                   ├── FortuneExtractorTest.java
│           │                   ├── InvestingParserTest.java
│           │                   ├── JsonLdParserTest.java
│           │                   ├── KedainietisTest.java
│           │                   ├── ReutersExtractorTest.java
│           │                   ├── urls
│           │                   │   ├── UrlExtractorTest.java
│           │                   │   └── UrlFiltersTest.java
│           │                   └── utils
│           │                       ├── HttpSourceTesterTest.java
│           │                       ├── QueryParserTest.java
│           │                       ├── TextFilterTest.java
│           │                       └── TextProfileSignatureTest.java
│           └── resources
│               ├── articles
│               │   ├── aljazeera1.html
│               │   ├── bbc1.html
│               │   ├── bloomberg1.html
│               │   ├── cyberscoop1.html
│               │   ├── fortune1.html
│               │   ├── ft1.html
│               │   ├── investing1.html
│               │   ├── kedainietis.html
│               │   ├── nbcnews1.html
│               │   ├── reuters-blogs1.html
│               │   ├── reuters1.html
│               │   ├── reuters2.html
│               │   ├── reuters3.html
│               │   └── usanews1.html
│               └── jsonld
│                   └── bbc-1.json
├── pom.xml
└── ui-commons
    ├── pom.xml
    └── src
        └── main
            └── java
                └── lt
                    └── tokenmill
                        └── crawling
                            └── commonui
                                ├── Configuration.java
                                └── ElasticSearch.java

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.html linguist-vendored
*.css linguist-vendored
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
target/
*.iml
*.retry

**/*.gwt.xml
crawler/logs/
**/.classpath
**/.project
**/.settings

--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
stages:
  - base
  - test
  - build

prepare-base-docker:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/base:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/base:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/base:latest

prepare-base-elasticsearch:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.es -t registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest

prepare-administration-ui:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.ui -t registry.gitlab.com/tokenmill/crawling-framework/ui:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/ui:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/ui:latest

prepare-crawler:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.crawler -t registry.gitlab.com/tokenmill/crawling-framework/crawler:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/crawler:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/crawler:latest

unit-tests:
  stage: test
  image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
  when: always
  script:
    - mvn clean test

integration-tests:
  stage: test
  image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
  services:
    - name: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
      alias: elasticsearch
  when: always
  script:
    - mvn -Dtest=*TestInt -DfailIfNoTests=false clean test

--------------------------------------------------------------------------------
/Dockerfile.base:
--------------------------------------------------------------------------------
FROM maven:3.5.4-jdk-8-alpine as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN mvn clean install

# Keep only the populated local Maven repository so downstream images
# start with all project dependencies already resolved.
FROM maven:3.5.4-jdk-8-alpine
COPY --from=builder /root/.m2/ /root/.m2/

--------------------------------------------------------------------------------
/Dockerfile.crawler:
--------------------------------------------------------------------------------
FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN cd crawler && \
    mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests

FROM maven:3.5.4-jdk-8-alpine
RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY --from=builder /usr/src/cf/crawler/target/crawler-standalone.jar crawler-standalone.jar
COPY --from=builder /usr/src/cf/crawler/conf/docker-compose.yaml docker-compose.yaml

CMD ["java", "-cp", "crawler-standalone.jar", "lt.tokenmill.crawling.crawler.CrawlerTopology", "-local", "-conf", "docker-compose.yaml"]

--------------------------------------------------------------------------------
/Dockerfile.es:
--------------------------------------------------------------------------------
FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0 as builder

ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/e1f115e4ca285c3c24e847c4dd4be955e0ed51c2/wait-for-it.sh /utils/wait-for-it.sh

COPY bin/ bin/
COPY elasticsearch/ elasticsearch/

# Start Elasticsearch in the background, wait until it answers on :9200,
# create the framework's indices, then stop it so the initialized data
# directory can be baked into the final image below.
RUN /usr/local/bin/docker-entrypoint.sh elasticsearch -p /tmp/epid & /bin/bash /utils/wait-for-it.sh -t 0 localhost:9200 -- \
    ./bin/create-es-indices.sh ; \
    kill $(cat /tmp/epid) && wait $(cat /tmp/epid); exit 0;

FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0

COPY --from=builder /usr/share/elasticsearch/data /usr/share/elasticsearch/data

--------------------------------------------------------------------------------
/Dockerfile.ui:
--------------------------------------------------------------------------------
FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN cd administration-ui && mvn clean package -Pbigjar

FROM maven:3.5.4-jdk-8-alpine
RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY --from=builder /usr/src/cf/administration-ui/target/administration-ui-standalone.jar administration-ui-standalone.jar
COPY --from=builder /usr/src/cf/administration-ui/conf/docker-compose.properties docker-compose.properties

CMD ["java", "-Dconfig=docker-compose.properties", "-jar", "administration-ui-standalone.jar"]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2017-2019 Tokenmill, UAB

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
unit-test:
	mvn clean test

run-dev-env:
	docker-compose -f docker-compose.dev.yml pull && \
	docker-compose -f docker-compose.dev.yml down && \
	docker-compose -f docker-compose.dev.yml build && \
	docker-compose -f docker-compose.dev.yml up --remove-orphans

build-base-docker:
	docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/deps:latest .

publish-base-docker: build-base-docker
	docker push registry.gitlab.com/tokenmill/crawling-framework/deps:latest

run-framework:
	docker-compose -f docker-compose.run.yml pull && \
	docker-compose -f docker-compose.run.yml down && \
	docker-compose -f docker-compose.run.yml build && \
	docker-compose -f docker-compose.run.yml up --remove-orphans

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Crawling Framework

[![Maven Central](https://img.shields.io/maven-central/v/lt.tokenmill.crawling/crawling-framework.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22lt.tokenmill.crawling%22%20AND%20a:%22crawling-framework%22)
[![pipeline status](https://gitlab.com/tokenmill/crawling-framework/badges/master/pipeline.svg)](https://gitlab.com/tokenmill/crawling-framework/commits/master)

Crawling Framework provides instruments to configure and run a [Storm Crawler](http://stormcrawler.net/) based crawler. It mainly aims at easing the crawling of sites which publish article content, such as news portals and blogs. With the GUI tool the Crawling Framework provides, you can:

1. Specify which sites to crawl.
1. Configure URL inclusion and exclusion filters, thus controlling which sections of a site will be fetched.
1. Specify which elements of a page carry the article's publication name, title, and main body.
1. Define tests which validate that the extraction rules are working.

Once configuration is done, the Crawling Framework runs a [Storm Crawler](http://stormcrawler.net/) based crawl following the rules specified in the configuration.

## Introduction

We have recorded a video on how to set up and use the Crawling Framework. Click on the image below to watch it on YouTube.

[![Crawling Framework Intro](https://img.youtube.com/vi/AvO4lmmIuis/0.jpg)](https://www.youtube.com/watch?v=AvO4lmmIuis)

## Requirements

The framework writes its configuration to and stores crawled data in ElasticSearch. Before starting a crawl project, [install ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html) (the Crawling Framework is tested to work with Elastic v7.x).

Crawling Framework is a Java library which has to be extended to run a Storm Crawler topology, so a Java (JDK8, Maven) infrastructure is needed.

### Using password protected ElasticSearch

Some providers put ElasticSearch behind an authentication step (which makes sense). Just set the environment variables `ES_USERNAME` and `ES_PASSWORD` accordingly; everything else can remain the same. Authentication will be performed implicitly when proper credentials are present.
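For illustration only, here is a minimal sketch of the Basic-auth header such credentials produce. The variable names come from this README; the class and the printed output are made-up examples, not the framework's actual client wiring:

```java
import java.util.Base64;

public class EsBasicAuthSketch {

    public static void main(String[] args) {
        // ES_USERNAME / ES_PASSWORD are the variables this README describes.
        String username = System.getenv("ES_USERNAME");
        String password = System.getenv("ES_PASSWORD");
        if (username != null && password != null) {
            // Standard HTTP Basic scheme: base64("user:password").
            String token = Base64.getEncoder()
                    .encodeToString((username + ":" + password).getBytes());
            System.out.println("Authorization: Basic " + token);
        }
    }
}
```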
## Configuring and Running a crawl

See the [Crawling Framework Example](https://github.com/tokenmill/crawling-framework-example) project's documentation.

## License

Copyright © 2017-2019 [TokenMill UAB](http://www.tokenmill.ai).

Distributed under the Apache License, Version 2.0.

--------------------------------------------------------------------------------
/administration-ui/conf/development.properties:
--------------------------------------------------------------------------------
port=8081
es.hostname=localhost
es.transport.port=9300
es.httpsource.index.name=http_sources
es.httpsource.doc.type=http_source
es.httpsourcetest.index.name=http_source_tests
es.httpsourcetest.doc.type=http_source_test
es.namedqueries.index.name=named_queries
es.namedqueries.doc.type=named_query
es.docs.index.name=docs
es.docs.doc.type=doc
es.urls.index.name=urls
es.urls.doc.type=url

--------------------------------------------------------------------------------
/administration-ui/conf/docker-compose.properties:
--------------------------------------------------------------------------------
port=8081
es.hostname=elasticsearch
es.transport.port=9300
es.httpsource.index.name=http_sources
es.httpsource.doc.type=http_source
es.httpsourcetest.index.name=http_source_tests
es.httpsourcetest.doc.type=http_source_test
es.namedqueries.index.name=named_queries
es.namedqueries.doc.type=named_query
es.docs.index.name=docs
es.docs.doc.type=doc
es.urls.index.name=urls
es.urls.doc.type=url
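The two properties files above differ only in `es.hostname` (localhost for development, the `elasticsearch` service name under docker-compose; Dockerfile.ui selects the latter via `-Dconfig=docker-compose.properties`). A minimal sketch of reading such values through the `Configuration` helper, whose `getInt`/`getString` accessors appear in `Application.java` below — the key names are from the files above, the wrapper class itself is just an example:

```java
import lt.tokenmill.crawling.commonui.Configuration;

public class ConfigurationSketch {

    public static void main(String[] args) {
        // Same accessors Application.java uses: a key plus a default value.
        int port = Configuration.INSTANCE.getInt("port", 8080);
        String esHost = Configuration.INSTANCE.getString("es.hostname", "localhost");
        System.out.println("UI on port " + port + ", Elasticsearch at " + esHost);
    }
}
```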
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/Application.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.vaadin.server.VaadinServlet;
import lt.tokenmill.crawling.commonui.Configuration;
import org.eclipse.jetty.security.*;
import org.eclipse.jetty.security.authentication.BasicAuthenticator;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.eclipse.jetty.util.security.Constraint;
import org.eclipse.jetty.util.security.Credential;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class Application {

    private static final Logger LOG = LoggerFactory.getLogger(Application.class);
    private static final Boolean PRODUCTION_MODE = true;

    // Builds a Jetty security handler that protects every path with
    // HTTP Basic auth for a single "editor" user.
    private static SecurityHandler basicAuth(String username, String password, String realm) {
        HashLoginService l = new HashLoginService();
        l.putUser(username, Credential.getCredential(password), new String[]{"editor"});
        l.setName(realm);

        Constraint constraint = new Constraint();
        constraint.setName(Constraint.__BASIC_AUTH);
        constraint.setRoles(new String[]{"editor"});
        constraint.setAuthenticate(true);

        ConstraintMapping cm = new ConstraintMapping();
        cm.setConstraint(constraint);
        cm.setPathSpec("/*");

        ConstraintSecurityHandler csh = new ConstraintSecurityHandler();
        csh.setAuthenticator(new BasicAuthenticator());
        csh.setRealmName("cf");
        csh.addConstraintMapping(cm);
        csh.setLoginService(l);

        return csh;
    }

    public static void main(String[] args) {
        int port = Configuration.INSTANCE.getInt("port", 8080);
        Server server = new Server(port);
        ServletContextHandler contextHandler
                = new ServletContextHandler(ServletContextHandler.SESSIONS);

        boolean authEnabled = Boolean.parseBoolean(Configuration.INSTANCE.getString("basicAuth", "false"));

        if (authEnabled) {
            contextHandler.setSecurityHandler(basicAuth(System.getenv("UI_USER"), System.getenv("UI_PASSWORD"), "editor"));
        }
        contextHandler.setContextPath("/");
        ServletHolder sh = new ServletHolder(new VaadinServlet());
        contextHandler.addServlet(sh, "/*");
        contextHandler.setInitParameter("ui", CrawlerAdminUI.class.getCanonicalName());
        contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE));
        server.setHandler(contextHandler);

        try {
            server.start();
            server.join();
        } catch (Exception e) {
            LOG.error("Failed to start application", e);
        }
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/CrawlerAdminUI.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.vaadin.annotations.Theme;
import com.vaadin.annotations.VaadinServletConfiguration;
import com.vaadin.server.VaadinRequest;
import com.vaadin.server.VaadinServlet;
import com.vaadin.ui.UI;
import lt.tokenmill.crawling.adminui.view.HttpSourcesView;

import javax.servlet.annotation.WebServlet;

@Theme("crawleradmintheme")
public class CrawlerAdminUI extends UI {

    @Override
    protected void init(VaadinRequest vaadinRequest) {
        setContent(new HttpSourcesView());
    }

    @WebServlet(urlPatterns = "/*", name = "CrawlerAdminUIServlet", asyncSupported = true)
    @VaadinServletConfiguration(ui = CrawlerAdminUI.class, productionMode = false)
    public static class CrawlerAdminUIServlet extends VaadinServlet {
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/HttpSourceTestsCache.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.TimeUnit;

public class HttpSourceTestsCache {

    private static final Cache<String, HttpSourceTest> CACHE = CacheBuilder
            .newBuilder()
            .maximumSize(1000)
            .expireAfterWrite(5, TimeUnit.DAYS)
            .build();

    public static HttpSourceTest get(String sourceUrl) {
        HttpSourceTest test = CACHE.getIfPresent(sourceUrl.toLowerCase());
        return test != null ? test : new HttpSourceTest("", "");
    }

    public static void put(String sourceUrl, String url, String html) {
        CACHE.put(sourceUrl.toLowerCase(),
                new HttpSourceTest(Strings.nullToEmpty(url), Strings.nullToEmpty(html)));
    }

    public static class HttpSourceTest {

        private String url;
        private String html;

        public HttpSourceTest(String url, String html) {
            this.url = url;
            this.html = html;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getHtml() {
            return html;
        }

        public void setHtml(String html) {
            this.html = html;
        }
    }
}
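A short usage sketch of the cache above (the example URLs are made up):

```java
import lt.tokenmill.crawling.adminui.HttpSourceTestsCache;

public class HttpSourceTestsCacheSketch {

    public static void main(String[] args) {
        // Keys are lower-cased source URLs; entries expire five days after write.
        HttpSourceTestsCache.put("http://example.com/", "http://example.com/article", "<html>...</html>");

        // Lookups lower-case the key as well, so casing does not matter.
        HttpSourceTestsCache.HttpSourceTest cached = HttpSourceTestsCache.get("HTTP://EXAMPLE.COM/");
        System.out.println(cached.getUrl()); // http://example.com/article

        // Misses return an empty placeholder instead of null.
        HttpSourceTestsCache.HttpSourceTest missing = HttpSourceTestsCache.get("http://unknown.example/");
        System.out.println(missing.getUrl().isEmpty()); // true
    }
}
```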
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/CSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.collect.Maps;
import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.Map;

public class CSVUtils {

    private static final char DEFAULT_SEPARATOR = ',';
    private static final char DEFAULT_QUOTE = '\"';
    private static final char DEFAULT_ESCAPE = '\\';

    public static CSVWriter createDefaultWriter(Writer writer) {
        return new CSVWriter(writer, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
    }

    public static CSVReader createDefaultReader(Reader reader) {
        return new CSVReader(reader, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
    }

    public static CSVReader createDefaultReader(String csv) {
        return createDefaultReader(new StringReader(csv));
    }

    // Maps each wanted column name to its position in the header row,
    // so importers can tolerate files with reordered columns.
    public static Map<String, Integer> resolveColumnIndexes(String[] columns, String[] headers) {
        Map<String, Integer> result = Maps.newHashMap();
        for (String column : columns) {
            for (int i = 0; i < headers.length; i++) {
                if (headers[i].equalsIgnoreCase(column)) {
                    result.put(column, i);
                }
            }
        }
        return result;
    }
}
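`resolveColumnIndexes` is what lets the CSV importers below tolerate files with reordered columns; a small illustration with made-up headers:

```java
import lt.tokenmill.crawling.adminui.utils.CSVUtils;

import java.util.Map;

public class CsvColumnSketch {

    public static void main(String[] args) {
        // Header row as it appeared in an uploaded file, in arbitrary order.
        String[] headers = {"name", "url", "language"};
        // Columns the importer actually needs.
        String[] columns = {"url", "name"};

        Map<String, Integer> indexes = CSVUtils.resolveColumnIndexes(columns, headers);
        System.out.println(indexes.get("url"));  // 1
        System.out.println(indexes.get("name")); // 0
    }
}
```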
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/GridUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Joiner;
import com.vaadin.data.Item;
import com.vaadin.data.util.PropertyValueGenerator;
import com.vaadin.data.util.converter.Converter;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

public class GridUtils {

    public static class StringListConverter implements Converter<String, List> {

        @Override
        public List convertToModel(String s, Class<? extends List> aClass, Locale locale) throws ConversionException {
            return new ArrayList();
        }

        @Override
        public String convertToPresentation(List list, Class<? extends String> aClass, Locale locale) throws ConversionException {
            return Joiner.on(", ").join(list);
        }

        @Override
        public Class<List> getModelType() {
            return List.class;
        }

        @Override
        public Class<String> getPresentationType() {
            return String.class;
        }
    }

    public static class UrlToLinkConverter implements Converter<String, String> {

        @Override
        public String convertToModel(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
            return string;
        }

        @Override
        public String convertToPresentation(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
            // Wrap the raw URL in an anchor tag so the grid renders it as a link.
            return String.format("<a href=\"%s\">%s</a>", string, string);
        }

        @Override
        public Class<String> getModelType() {
            return String.class;
        }

        @Override
        public Class<String> getPresentationType() {
            return String.class;
        }
    }

    public static class ButtonPropertyGenerator extends PropertyValueGenerator<String> {

        private String name;

        public ButtonPropertyGenerator(String name) {
            this.name = name;
        }

        @Override
        public String getValue(Item item, Object itemId, Object propertyId) {
            return name;
        }

        @Override
        public Class<String> getType() {
            return String.class;
        }
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceCSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Strings;
import lt.tokenmill.crawling.data.DataUtils;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.es.Utils;

import java.util.Map;
import java.util.Objects;

public class HttpSourceCSVUtils {

    public static final String[] CSV_COLUMNS = new String[]{
            "url", "name", "language", "timezone", "enabled",
            "discovery_enabled", "url_crawl_delay_secs", "feed_crawl_delay_secs",
            "sitemap_crawl_delay_secs", "urls", "feeds", "sitemaps",
            "categories", "app_ids",
            "url_filters", "url_normalizers", "title_selectors",
            "text_selectors", "text_normalizers",
            "date_selectors", "date_regexps", "date_formats"};

    public static String[] mapHttpSourceToCsvRow(HttpSource ld) {
        return new String[]{
                ld.getUrl(), ld.getName(), ld.getLanguage(), ld.getTimezone(),
                String.valueOf(ld.isEnabled()), String.valueOf(ld.isDiscoveryEnabled()),
                Objects.toString(ld.getUrlRecrawlDelayInSecs(), ""),
                Objects.toString(ld.getFeedRecrawlDelayInSecs(), ""),
                Objects.toString(ld.getSitemapRecrawlDelayInSecs(), ""),
                Utils.listToText(ld.getUrls()), Utils.listToText(ld.getFeeds()), Utils.listToText(ld.getSitemaps()),
                Utils.listToText(ld.getCategories()), Utils.listToText(ld.getAppIds()),
                Utils.listToText(ld.getUrlFilters()), Utils.listToText(ld.getUrlNormalizers()),
                Utils.listToText(ld.getTitleSelectors()),
                Utils.listToText(ld.getTextSelectors()), Utils.listToText(ld.getTextNormalizers()),
                Utils.listToText(ld.getDateSelectors()), Utils.listToText(ld.getDateRegexps()),
                Utils.listToText(ld.getDateFormats())
        };
    }

    public static HttpSource mapCsvRowToHttpSource(String[] row, Map<String, Integer> columnIndexes) {
        HttpSource hs = new HttpSource();
        hs.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
        hs.setName(Strings.emptyToNull(row[columnIndexes.get("name")]));
        hs.setLanguage(Strings.emptyToNull(row[columnIndexes.get("language")]));
        hs.setTimezone(Strings.emptyToNull(row[columnIndexes.get("timezone")]));
        hs.setEnabled(Boolean.parseBoolean(row[columnIndexes.get("enabled")]));
        hs.setDiscoveryEnabled(Boolean.parseBoolean(row[columnIndexes.get("discovery_enabled")]));
        hs.setUrlRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("url_crawl_delay_secs")]));
        hs.setFeedRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("feed_crawl_delay_secs")]));
        hs.setSitemapRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("sitemap_crawl_delay_secs")]));
        hs.setUrls(DataUtils.parseStringList(row[columnIndexes.get("urls")]));
        hs.setFeeds(DataUtils.parseStringList(row[columnIndexes.get("feeds")]));
        hs.setSitemaps(DataUtils.parseStringList(row[columnIndexes.get("sitemaps")]));
        hs.setCategories(DataUtils.parseStringList(row[columnIndexes.get("categories")]));
        hs.setAppIds(DataUtils.parseStringList(row[columnIndexes.get("app_ids")]));
        hs.setUrlFilters(DataUtils.parseStringList(row[columnIndexes.get("url_filters")]));
        hs.setUrlNormalizers(DataUtils.parseStringList(row[columnIndexes.get("url_normalizers")]));
        hs.setTitleSelectors(DataUtils.parseStringList(row[columnIndexes.get("title_selectors")]));
        hs.setTextSelectors(DataUtils.parseStringList(row[columnIndexes.get("text_selectors")]));
        hs.setTextNormalizers(DataUtils.parseStringList(row[columnIndexes.get("text_normalizers")]));
        hs.setDateSelectors(DataUtils.parseStringList(row[columnIndexes.get("date_selectors")]));
        hs.setDateRegexps(DataUtils.parseStringList(row[columnIndexes.get("date_regexps")]));
        hs.setDateFormats(DataUtils.parseStringList(row[columnIndexes.get("date_formats")]));
        return hs;
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.io.BaseEncoding;
import lt.tokenmill.crawling.data.HttpSourceTest;

import java.util.Map;
import java.util.Objects;

public class HttpSourceTestCSVUtils {

    public static final String[] CSV_COLUMNS = new String[]{
            "url", "source", "html", "url_accepted", "title", "text", "date"};

    public static String[] mapHttpSourceTestToCsvRow(HttpSourceTest httpSourceTest) {
        return new String[]{
                httpSourceTest.getUrl(), httpSourceTest.getSource(),
                // Base64-wrap the HTML so multi-line markup survives the CSV round trip.
                BaseEncoding.base64().encode(httpSourceTest.getHtml().getBytes(Charsets.UTF_8)),
                Objects.toString(httpSourceTest.getUrlAccepted(), "false"),
                Strings.nullToEmpty(httpSourceTest.getTitle()),
                Strings.nullToEmpty(httpSourceTest.getText()),
                Strings.nullToEmpty(httpSourceTest.getDate())
        };
    }

    public static HttpSourceTest mapCsvRowToHttpSourceTest(String[] row, Map<String, Integer> columnIndexes) {
        HttpSourceTest hst = new HttpSourceTest();
        hst.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
        hst.setSource(Strings.emptyToNull(row[columnIndexes.get("source")]));
        hst.setHtml(new String(BaseEncoding.base64().decode(row[columnIndexes.get("html")]), Charsets.UTF_8));
        hst.setUrlAccepted(Boolean.parseBoolean(row[columnIndexes.get("url_accepted")]));
        hst.setTitle(Strings.emptyToNull(row[columnIndexes.get("title")]));
        hst.setText(Strings.emptyToNull(row[columnIndexes.get("text")]));
        hst.setDate(Strings.emptyToNull(row[columnIndexes.get("date")]));
        return hst;
    }
}
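The HTML column above is base64-wrapped so that multi-line, quote-heavy markup survives the CSV round trip; a tiny demonstration of the same Guava calls:

```java
import com.google.common.base.Charsets;
import com.google.common.io.BaseEncoding;

public class HtmlRoundTripSketch {

    public static void main(String[] args) {
        String html = "<html>\n  \"quoted\" content\n</html>";
        // Same encode/decode pair as mapHttpSourceTestToCsvRow / mapCsvRowToHttpSourceTest.
        String encoded = BaseEncoding.base64().encode(html.getBytes(Charsets.UTF_8));
        String decoded = new String(BaseEncoding.base64().decode(encoded), Charsets.UTF_8);
        System.out.println(html.equals(decoded)); // true
    }
}
```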
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/BaseView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;


import com.vaadin.ui.HorizontalLayout;
import com.vaadin.ui.MenuBar;
import com.vaadin.ui.UI;
import com.vaadin.ui.VerticalLayout;
import lt.tokenmill.crawling.adminui.view.namedquery.NamedQueriesView;
import lt.tokenmill.crawling.adminui.view.pageanalysis.PageAnalysisView;
import lt.tokenmill.crawling.adminui.view.sourcetest.HttpSourceTestsView;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;

public class BaseView extends VerticalLayout {

    public BaseView(String title) {
        UI.getCurrent().getPage().setTitle(String.format("Crawler Admin | %s", title));
        setWidth(100, PERCENTAGE);
        setSpacing(true);
        setMargin(true);

        HorizontalLayout actionBarLayout = new HorizontalLayout();
        actionBarLayout.setWidth(100, PERCENTAGE);

        MenuBar menu = new MenuBar();

        MenuBar.MenuItem dataItem = menu.addItem("Configuration", null);
        dataItem.addItem("HTTP Sources", (item) -> UI.getCurrent().setContent(new HttpSourcesView()));
        dataItem.addItem("HTTP Source Tests", (item) -> UI.getCurrent().setContent(new HttpSourceTestsView()));
        dataItem.addItem("Named Queries", (item) -> UI.getCurrent().setContent(new NamedQueriesView()));
        dataItem.addItem("Import / Export", (item) -> UI.getCurrent().setContent(new ImportExportView()));

        menu.addItem("Page Analysis", (item) -> UI.getCurrent().setContent(new PageAnalysisView()));

        actionBarLayout.addComponent(menu);

        addComponent(actionBarLayout);
    }

}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/HttpSourceStatsWindow.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;


import com.byteowls.vaadin.chartjs.ChartJs;
import com.byteowls.vaadin.chartjs.config.BarChartConfig;
import com.byteowls.vaadin.chartjs.data.BarDataset;
import com.byteowls.vaadin.chartjs.data.Dataset;
import com.byteowls.vaadin.chartjs.data.LineDataset;
import com.byteowls.vaadin.chartjs.options.Position;
import com.vaadin.ui.Component;
import com.vaadin.ui.Window;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.es.model.DateHistogramValue;

import java.util.List;
import java.util.stream.Collectors;

public class HttpSourceStatsWindow extends Window {

    public HttpSourceStatsWindow(String sourceUrl) {
        setModal(true);
        center();
        setCaption(String.format("%s crawling statistics", sourceUrl));
        setWidth(50, Unit.PERCENTAGE);
        setHeight(50, Unit.PERCENTAGE);
        List<DateHistogramValue> urls = ElasticSearch.getUrlOperations().calculateStats(sourceUrl);
        List<DateHistogramValue> documents = ElasticSearch.getDocumentOperations().calculateStats(sourceUrl);
        Component layout = getChart(sourceUrl, urls, documents);
        layout.setWidth(100, Unit.PERCENTAGE);
        setContent(layout);
    }

    public Component getChart(String sourceUrl, List<DateHistogramValue> urls, List<DateHistogramValue> documents) {
        BarChartConfig config = new BarChartConfig();

        // Fetched documents as bars, discovered URLs as an overlaid line.
        BarDataset docsDataset = new BarDataset().type().label("Fetched Documents")
                .borderColor("rgb(54, 162, 235)")
                .backgroundColor("rgb(54, 162, 235)")
                .borderWidth(2);
        documents.forEach(d -> docsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));

        LineDataset urlsDataset = new LineDataset().type().label("Discovered Urls")
                .borderColor("rgb(75, 192, 192)")
                .backgroundColor("white")
                .borderWidth(2);
        urls.forEach(d -> urlsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));

        config.data()
                .labelsAsList(urls.stream().map(DateHistogramValue::getDate).collect(Collectors.toList()))
                .addDataset(docsDataset)
                .addDataset(urlsDataset)
                .and();

        config.options()
                .responsive(true)
                .title()
                .display(true)
                .position(Position.LEFT)
                .and()
                .done();

        ChartJs chart = new ChartJs(config);
        chart.setJsLoggingEnabled(true);
        return chart;
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/ImportExportView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;

import com.vaadin.ui.TabSheet;
import lt.tokenmill.crawling.adminui.view.imports.HttpSourceImportExport;
import lt.tokenmill.crawling.adminui.view.imports.HttpSourceTestImportExport;
import lt.tokenmill.crawling.adminui.view.imports.NamedQueryImportExport;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;

public class ImportExportView extends BaseView {

    public ImportExportView() {
        super("Import / Export");
        TabSheet mainLayout = new TabSheet();
        mainLayout.setWidth(100, PERCENTAGE);
        mainLayout.addTab(new HttpSourceImportExport(), "HTTP Sources");
        mainLayout.addTab(new HttpSourceTestImportExport(), "HTTP Source Tests");
        mainLayout.addTab(new NamedQueryImportExport(), "Named Queries");
        addComponent(mainLayout);
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueriesView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.namedquery;

import com.vaadin.data.util.BeanItemContainer;
import com.vaadin.data.util.GeneratedPropertyContainer;
import com.vaadin.ui.*;
import lt.tokenmill.crawling.adminui.view.BaseView;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.data.NamedQuery;
import lt.tokenmill.crawling.data.PageableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
import static com.vaadin.server.Sizeable.Unit.PIXELS;

public class NamedQueriesView extends BaseView {

    private static final Logger LOG = LoggerFactory.getLogger(NamedQueriesView.class);

    private Grid itemsGrid = new Grid(new GeneratedPropertyContainer(new BeanItemContainer<>(NamedQuery.class)));
    private Label totalCountLabel = new Label();
    private TextField filterField = new TextField();

    public NamedQueriesView() {
        super("Named Queries");
        HorizontalLayout mainLayout = new HorizontalLayout();
        mainLayout.setWidth(100, PERCENTAGE);
        mainLayout.setHeight(100, PERCENTAGE);
        mainLayout.setSpacing(true);

        VerticalLayout gridLayout = new VerticalLayout();
        gridLayout.setSpacing(true);
        gridLayout.setWidth(100, PERCENTAGE);

        // Search field and create new button
        filterField.setInputPrompt("Enter Name...");
        filterField.addTextChangeListener(event -> refreshGrid(event.getText()));

        Button addNewButton = new Button("Add New Query");
        addNewButton.addClickListener(event -> showNamedQueryForm(new NamedQuery()));

        HorizontalLayout actionHeader = new HorizontalLayout(filterField, addNewButton);
        actionHeader.setSpacing(true);
        actionHeader.setWidth(100, PERCENTAGE);
        filterField.setWidth(100, PERCENTAGE);
        actionHeader.setExpandRatio(filterField, 1.0f);
        gridLayout.addComponent(actionHeader);

        // Grid
        itemsGrid.setWidth(100, PERCENTAGE);
        itemsGrid.setHeight(700, PIXELS);
        itemsGrid.setSelectionMode(Grid.SelectionMode.SINGLE);
        itemsGrid.addSelectionListener(
                e -> {
                    NamedQuery nq = (NamedQuery) itemsGrid.getSelectedRow();
                    if (nq != null) {
                        nq = ElasticSearch.getNamedQueryOperations().get(nq.getName());
                        showNamedQueryForm(nq);
                    }
                });
        itemsGrid.setColumns("name");
        gridLayout.addComponent(itemsGrid);
        gridLayout.addComponent(totalCountLabel);
        refreshGrid(filterField.getValue());
        mainLayout.addComponent(gridLayout);
        mainLayout.setExpandRatio(gridLayout, 1f);
        addComponent(mainLayout);
    }

    private void refreshGrid(String text) {
        PageableList<NamedQuery> data = ElasticSearch.getNamedQueryOperations().filter(text);
        itemsGrid.getContainerDataSource().removeAllItems();
        for (NamedQuery nq : data.getItems()) {
            itemsGrid.getContainerDataSource().addItem(nq);
        }
        totalCountLabel.setValue(String.format("Total count: %d", data.getTotalCount()));
        LOG.info("Refreshed grid using filter '{}'. Total items: {}", text, data.getTotalCount());
Total items: {}", text, data.getTotalCount()); 79 | } 80 | 81 | private void showNamedQueryForm(NamedQuery nq) { 82 | NamedQueryFormWindow formWindow = new NamedQueryFormWindow(nq); 83 | formWindow.addAfterUpdateListener(() -> refreshGrid(filterField.getValue())); 84 | UI.getCurrent().addWindow(formWindow); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueryResultsPanel.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.adminui.view.namedquery; 2 | 3 | import com.vaadin.shared.ui.label.ContentMode; 4 | import com.vaadin.ui.Label; 5 | import com.vaadin.ui.Panel; 6 | import com.vaadin.ui.VerticalLayout; 7 | import com.vaadin.ui.themes.ValoTheme; 8 | import lt.tokenmill.crawling.data.DataUtils; 9 | import lt.tokenmill.crawling.data.HttpArticle; 10 | import lt.tokenmill.crawling.data.PageableList; 11 | 12 | public class NamedQueryResultsPanel extends Panel { 13 | 14 | public NamedQueryResultsPanel(PageableList results) { 15 | VerticalLayout layout = new VerticalLayout(); 16 | layout.setMargin(true); 17 | 18 | Label countLabel = new Label(String.format("%s documents matched", results.getTotalCount())); 19 | countLabel.addStyleName(ValoTheme.LABEL_LARGE); 20 | countLabel.setSizeFull(); 21 | layout.addComponent(countLabel); 22 | 23 | for (HttpArticle article : results.getItems()) { 24 | String labelHtml = String.format("%s %s - %s", 25 | DataUtils.formatInUTC(article.getPublished()), article.getUrl(), article.getTitle(), article.getSource()); 26 | Label articleLabel = new Label(labelHtml); 27 | articleLabel.setContentMode(ContentMode.HTML); 28 | articleLabel.setSizeFull(); 29 | layout.addComponent(articleLabel); 30 | } 31 | setContent(layout); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/HttpSourceAllTestsWindow.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.adminui.view.sourcetest; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.vaadin.ui.*; 5 | import com.vaadin.ui.themes.ValoTheme; 6 | import lt.tokenmill.crawling.commonui.ElasticSearch; 7 | import lt.tokenmill.crawling.data.HttpSource; 8 | import lt.tokenmill.crawling.data.HttpSourceTest; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.List; 13 | 14 | public class HttpSourceAllTestsWindow extends Window { 15 | 16 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceAllTestsWindow.class); 17 | 18 | private List afterUpdateListeners = Lists.newArrayList(); 19 | 20 | private Button cancelButton = new Button("Close", (event) -> this.close()); 21 | 22 | public HttpSourceAllTestsWindow() { 23 | setCaption("All Tests"); 24 | setModal(true); 25 | center(); 26 | setWidth(80, Unit.PERCENTAGE); 27 | setHeight(80, Unit.PERCENTAGE); 28 | 29 | VerticalLayout mainLayout = new VerticalLayout(); 30 | mainLayout.setMargin(true); 31 | 32 | List tests = ElasticSearch.getHttpSourceTestOperations().all(); 33 | for (HttpSourceTest test : tests) { 34 | HttpSource source = ElasticSearch.getHttpSourceOperations().get(test.getSource()); 35 | if (source == null) { 36 | Label noSourceLabel = new Label(String.format("Source configuration '%s' not found", test.getSource())); 37 | 
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/HttpSourceAllTestsWindow.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.sourcetest;

import com.google.common.collect.Lists;
import com.vaadin.ui.*;
import com.vaadin.ui.themes.ValoTheme;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

public class HttpSourceAllTestsWindow extends Window {

    private static final Logger LOG = LoggerFactory.getLogger(HttpSourceAllTestsWindow.class);

    private List afterUpdateListeners = Lists.newArrayList();

    private Button cancelButton = new Button("Close", (event) -> this.close());

    public HttpSourceAllTestsWindow() {
        setCaption("All Tests");
        setModal(true);
        center();
        setWidth(80, Unit.PERCENTAGE);
        setHeight(80, Unit.PERCENTAGE);

        VerticalLayout mainLayout = new VerticalLayout();
        mainLayout.setMargin(true);

        // Run every stored test against its source configuration and show one panel per test.
        List<HttpSourceTest> tests = ElasticSearch.getHttpSourceTestOperations().all();
        for (HttpSourceTest test : tests) {
            HttpSource source = ElasticSearch.getHttpSourceOperations().get(test.getSource());
            if (source == null) {
                Label noSourceLabel = new Label(String.format("Source configuration '%s' not found", test.getSource()));
                noSourceLabel.addStyleName(ValoTheme.LABEL_FAILURE);
                noSourceLabel.setSizeFull();
                mainLayout.addComponent(noSourceLabel);
            } else {
                mainLayout.addComponent(new TestResultsPanel(source, test));
            }
        }

        HorizontalLayout actions = new HorizontalLayout(cancelButton);
        actions.setSpacing(true);

        setContent(mainLayout);
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/TestResultsPanel.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.sourcetest;

import com.vaadin.ui.*;
import com.vaadin.ui.themes.ValoTheme;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class TestResultsPanel extends Panel {

    private Map difference;

    public TestResultsPanel(HttpSource source, HttpSourceTest test) {
        // An empty difference map means extraction matched all expectations.
        this.difference = HttpSourceTester.test(source, test);
        VerticalLayout layout = new VerticalLayout();
        layout.setMargin(true);
        if (this.difference.isEmpty()) {
            Label resultLabel = new Label(String.format("'%s' Test Passed", test.getUrl()));
            resultLabel.addStyleName(ValoTheme.LABEL_SUCCESS);
            resultLabel.setSizeFull();
            layout.addComponent(resultLabel);
        } else {
            Label resultLabel = new Label(String.format("'%s' Test Failed", test.getUrl()));
            resultLabel.addStyleName(ValoTheme.LABEL_FAILURE);
            resultLabel.setSizeFull();
            layout.addComponent(resultLabel);
        }

        for (Map.Entry diff : difference.entrySet()) {
            HorizontalLayout fieldLayout = new HorizontalLayout();
            fieldLayout.setSizeFull();

            Label resultLabel = new Label(diff.getKey());
            resultLabel.addStyleName(ValoTheme.LABEL_LARGE);
            fieldLayout.addComponent(resultLabel);
            fieldLayout.setComponentAlignment(resultLabel, Alignment.MIDDLE_CENTER);
            fieldLayout.setExpandRatio(resultLabel, 0.15f);

            FormLayout valuesLayout = new FormLayout();
            valuesLayout.setWidth(100, Unit.PERCENTAGE);
            valuesLayout.setSizeFull();

            TextArea expected = new TextArea("Expected");
            expected.setSizeFull();
            expected.setRows(2);
            expected.setValue(diff.getValue().getExpected());
            expected.setReadOnly(true);

            TextArea actual = new TextArea("Actual");
            actual.setSizeFull();
            actual.setRows(2);
            actual.setValue(diff.getValue().getActual());
            actual.setReadOnly(true);

            valuesLayout.addComponents(expected, actual);

            fieldLayout.addComponent(valuesLayout);
            fieldLayout.setExpandRatio(valuesLayout, 0.85f);

            layout.addComponent(fieldLayout);
        }

        setContent(layout);
    }

    public boolean passed() {
        return difference != null && difference.isEmpty();
    }
}
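`TestResultsPanel` treats an empty difference map as a pass; a sketch of driving the same check directly (the empty source/test objects are placeholders — real ones carry selectors and expected values):

```java
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class SourceTestSketch {

    public static void main(String[] args) {
        HttpSource source = new HttpSource();       // extraction rules would go here
        HttpSourceTest test = new HttpSourceTest(); // expected title/text/date would go here
        // Same call TestResultsPanel makes; an empty map means the test passed.
        Map difference = HttpSourceTester.test(source, test);
        System.out.println(difference.isEmpty() ? "passed" : "failed: " + difference.keySet());
    }
}
```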
--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=DEBUG, stdout

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n


log4j.logger.org.apache=INFO
log4j.logger.org.eclipse.jetty=INFO
log4j.logger.org.elasticsearch=INFO

--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/crawling-framework/987100fee5965b43e178c9096ab3b2aa3a11fac7/administration-ui/src/main/resources/log4j2.properties

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/addons.scss:
--------------------------------------------------------------------------------
/* This file is automatically managed and will be overwritten from time to time. */
/* Do not manually edit this file. */

/* Import and include this mixin into your project theme to include the addon themes */
@mixin addons {
}

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/crawleradmintheme.scss:
--------------------------------------------------------------------------------
// If you edit this file you need to compile the theme. See README.md for details.
// Global variable overrides. Must be declared before importing Valo.
// Defines the plaintext font size, weight and family. Font size affects general component sizing.

//$v-font-size: 16px;
//$v-font-weight: 300;
//$v-font-family: "Open Sans", sans-serif;

// Defines the border used by all components.
//$v-border: 1px solid (v-shade 0.7);
//$v-border-radius: 4px;

// Affects the color of some component elements, e.g Button, Panel title, etc
//$v-background-color: hsl(210, 0%, 98%);

// Affects the color of content areas, e.g Panel and Window content, TextField input etc
//$v-app-background-color: $v-background-color;

// Affects the visual appearance of all components
//$v-gradient: v-linear 8%;
//$v-bevel-depth: 30%;
//$v-shadow-opacity: 5%;

// Defines colors for indicating status (focus, success, failure)
//$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically
//$v-friendly-color: #2c9720;
//$v-error-indicator-color: #ed473b;

// For more information, see: https://vaadin.com/book/-/page/themes.valo.html
// Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples

@import "../valo/valo.scss";

@mixin crawleradmintheme {
  @include valo;

  // Insert your own theme rules here
}

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/styles.scss:
--------------------------------------------------------------------------------
@import "crawleradmintheme.scss";
@import "addons.scss";

// This file prefixes all rules with the theme name to avoid causing conflicts with other themes.
// The actual styles should be defined in crawleradmintheme.scss

.crawleradmintheme {
  @include addons;
  @include crawleradmintheme;
}

--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtilsTest.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import lt.tokenmill.crawling.data.HttpSourceTest;
import org.junit.Test;

import java.net.URL;
import java.time.Instant;
import java.util.Map;

import static lt.tokenmill.crawling.adminui.utils.HttpSourceTestCSVUtils.CSV_COLUMNS;
import static org.junit.Assert.assertEquals;

public class HttpSourceTestCSVUtilsTest {

    protected String loadHtml(String name) throws Exception {
        URL htmlResource = Resources.getResource(name + ".html");
        return Resources.toString(htmlResource, Charsets.UTF_8);
    }

    @Test
    public void testHttpSourceTestToCsvAndBack() throws Exception {
        HttpSourceTest httpSourceTest = new HttpSourceTest();
        httpSourceTest.setUrl("http://www.tokenmill.lt/");
        httpSourceTest.setSource("http://www.tokenmill.lt/");
        httpSourceTest.setHtml(loadHtml("www.tokenmill.lt"));
        httpSourceTest.setUrlAccepted(true);
        httpSourceTest.setTitle("TokenMill");
        httpSourceTest.setText("Some text");
        httpSourceTest.setDate(Instant.now().toString());

        String[] csvRow = HttpSourceTestCSVUtils.mapHttpSourceTestToCsvRow(httpSourceTest);
        String[] headerLine = CSV_COLUMNS;
        Map<String, Integer> columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS);
        HttpSourceTest fromRow = HttpSourceTestCSVUtils.mapCsvRowToHttpSourceTest(csvRow, columnIndexes);
        assertEquals(httpSourceTest.getUrl(), fromRow.getUrl());
        assertEquals(httpSourceTest.getSource(), fromRow.getSource());
        assertEquals(httpSourceTest.getHtml(), fromRow.getHtml());
        assertEquals(httpSourceTest.getUrlAccepted(), fromRow.getUrlAccepted());
        assertEquals(httpSourceTest.getTitle(), fromRow.getTitle());
        assertEquals(httpSourceTest.getText(), fromRow.getText());
        assertEquals(httpSourceTest.getDate(), fromRow.getDate());
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourcesCSVUtilsTest.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import lt.tokenmill.crawling.data.HttpSource;
import org.junit.Test;

import java.util.Arrays;
import java.util.Map;

import static lt.tokenmill.crawling.adminui.utils.HttpSourceCSVUtils.CSV_COLUMNS;
import static org.junit.Assert.assertEquals;

public class HttpSourcesCSVUtilsTest {

    @Test
    public void testHttpSourcesToCsvAndBack() {
        HttpSource source = new HttpSource();
        source.setUrl("url");
        source.setName("name");
        source.setLanguage("language");
        source.setTimezone("timezone");
        source.setEnabled(true);
        source.setDiscoveryEnabled(true);
        source.setUrlRecrawlDelayInSecs(1);
        source.setFeedRecrawlDelayInSecs(1);
        source.setSitemapRecrawlDelayInSecs(1);
        source.setUrls(Arrays.asList("url1", "url2"));
"url2")); 27 | source.setFeeds(Arrays.asList("feed1", "feed2")); 28 | source.setSitemaps(Arrays.asList("sitemap1", "sitemap2")); 29 | source.setCategories(Arrays.asList("cat1", "cat2")); 30 | source.setAppIds(Arrays.asList("app1", "app2")); 31 | source.setUrlFilters(Arrays.asList("f1", "f2")); 32 | source.setUrlNormalizers(Arrays.asList("n1", "n2")); 33 | source.setTitleSelectors(Arrays.asList("ts1", "ts2")); 34 | source.setTextSelectors(Arrays.asList("ts1", "ts2")); 35 | source.setTextNormalizers(Arrays.asList("tn1", "tn2")); 36 | source.setDateSelectors(Arrays.asList("ds1", "ds2")); 37 | source.setDateRegexps(Arrays.asList("dr1", "dr2")); 38 | source.setDateFormats(Arrays.asList("df1", "df2")); 39 | 40 | String[] row = HttpSourceCSVUtils.mapHttpSourceToCsvRow(source); 41 | String[] headerLine = CSV_COLUMNS; 42 | Map columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS); 43 | HttpSource fromRow = HttpSourceCSVUtils.mapCsvRowToHttpSource(row, columnIndexes); 44 | assertEquals(source, fromRow); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /analysis-ui/conf/development.properties: -------------------------------------------------------------------------------- 1 | port=8080 2 | es.hostname=localhost 3 | es.transport.port=9300 4 | es.httpsource.index.name=http_sources 5 | es.httpsource.doc.type=http_source 6 | es.httpsourcetest.index.name=http_source_tests 7 | es.httpsourcetest.doc.type=http_source_test 8 | es.namedqueries.index.name=named_queries 9 | es.namedqueries.doc.type=named_query 10 | es.docs.index.name=docs 11 | es.docs.doc.type=doc -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/AnalysisUI.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui; 2 | 3 | import com.vaadin.annotations.Theme; 4 | import com.vaadin.annotations.VaadinServletConfiguration; 5 | import com.vaadin.server.VaadinRequest; 6 | import com.vaadin.server.VaadinServlet; 7 | import com.vaadin.ui.UI; 8 | import lt.tokenmill.crawling.analysisui.view.SearchView; 9 | 10 | import javax.servlet.annotation.WebServlet; 11 | 12 | @Theme("analysistheme") 13 | public class AnalysisUI extends UI { 14 | 15 | @Override 16 | protected void init(VaadinRequest vaadinRequest) { 17 | setContent(new SearchView()); 18 | } 19 | 20 | @WebServlet(urlPatterns = "/*", name = "AnalysisUIServlet", asyncSupported = true) 21 | @VaadinServletConfiguration(ui = AnalysisUI.class, productionMode = false) 22 | public static class AnalysisUIServlet extends VaadinServlet { 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/Application.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui; 2 | 3 | import com.vaadin.server.VaadinServlet; 4 | import lt.tokenmill.crawling.commonui.Configuration; 5 | import org.eclipse.jetty.server.Server; 6 | import org.eclipse.jetty.servlet.ServletContextHandler; 7 | import org.eclipse.jetty.servlet.ServletHolder; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | 12 | public class Application { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(Application.class); 15 | private static final Boolean PRODUCTION_MODE = true; 16 | 17 | public static void 
main(String[] args) { 18 | int port = Configuration.INSTANCE.getInt("port", 8080); 19 | Server server = new Server(port); 20 | ServletContextHandler contextHandler 21 | = new ServletContextHandler(ServletContextHandler.SESSIONS); 22 | contextHandler.setContextPath("/"); 23 | ServletHolder sh = new ServletHolder(new VaadinServlet()); 24 | contextHandler.addServlet(sh, "/*"); 25 | contextHandler.setInitParameter("ui", AnalysisUI.class.getCanonicalName()); 26 | contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE)); 27 | server.setHandler(contextHandler); 28 | try { 29 | server.start(); 30 | server.join(); 31 | } catch (Exception e) { 32 | LOG.error("Failed to start application", e); 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/search/ResultPanel.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.search; 2 | 3 | import com.vaadin.shared.ui.label.ContentMode; 4 | import com.vaadin.ui.Label; 5 | import com.vaadin.ui.Panel; 6 | import com.vaadin.ui.VerticalLayout; 7 | import lt.tokenmill.crawling.data.DataUtils; 8 | import lt.tokenmill.crawling.data.HighlightedSearchResult; 9 | import lt.tokenmill.crawling.data.HttpArticle; 10 | 11 | import java.util.stream.Collectors; 12 | 13 | public class ResultPanel extends Panel { 14 | 15 | private static final String RESULTS_TEMPLATE = "%s <a href=\"%s\">%s</a>  <i>%s</i><br>%s"; 16 | 17 | public ResultPanel(HighlightedSearchResult searchResult) { 18 | HttpArticle article = searchResult.getArticle(); 19 | String highlights = searchResult.getHighlights().stream().collect(Collectors.joining("<br>...<br>")); 20 | String text = String.format(RESULTS_TEMPLATE, 21 | DataUtils.formatInUTC(article.getPublished()).replace("T", " "), 22 | article.getUrl(), article.getTitle(), article.getSource(), highlights); 23 | Label content = new Label(text); 24 | content.setContentMode(ContentMode.HTML); 25 | VerticalLayout component = new VerticalLayout(content); 26 | component.setMargin(true); 27 | setContent(component); 28 | } 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/BaseView.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.view; 2 | 3 | 4 | import com.vaadin.ui.HorizontalLayout; 5 | import com.vaadin.ui.MenuBar; 6 | import com.vaadin.ui.UI; 7 | import com.vaadin.ui.VerticalLayout; 8 | 9 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE; 10 | 11 | public class BaseView extends VerticalLayout { 12 | 13 | public BaseView(String title) { 14 | UI.getCurrent().getPage().setTitle(String.format("Analysis | %s", title)); 15 | setWidth(100, PERCENTAGE); 16 | setSpacing(true); 17 | setMargin(true); 18 | 19 | HorizontalLayout actionBarLayout = new HorizontalLayout(); 20 | actionBarLayout.setWidth(100, PERCENTAGE); 21 | 22 | MenuBar menu = new MenuBar(); 23 | 24 | menu.addItem("Search", (item) -> UI.getCurrent().setContent(new SearchView())); 25 | menu.addItem("Context Cloud", (item) -> UI.getCurrent().setContent(new ContextCloudView())); 26 | 27 | actionBarLayout.addComponent(menu); 28 | 29 | addComponent(actionBarLayout); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/SearchView.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.view; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.vaadin.ui.*; 5 | import lt.tokenmill.crawling.analysisui.search.ResultPanel; 6 | import lt.tokenmill.crawling.commonui.ElasticSearch; 7 | import lt.tokenmill.crawling.data.HighlightedSearchResult; 8 | import lt.tokenmill.crawling.data.NamedQuery; 9 | import lt.tokenmill.crawling.data.PageableList; 10 | import lt.tokenmill.crawling.parser.utils.QueryParser; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.util.List; 15 | import java.util.stream.Collectors; 16 | 17 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE; 18 | 19 | public class SearchView extends BaseView { 20 | 21 | private static final Logger LOG = LoggerFactory.getLogger(SearchView.class); 22 | 23 | private TextField filterField = new TextField(); 24 | private Label queryDescriptionLabel = new Label(); 25 | private VerticalLayout resultLayout = new VerticalLayout(); 26 | 27 | public SearchView() { 28 | super("Search"); 29 | Button searchButton = new Button("Search"); 30 | searchButton.addClickListener(event -> search()); 31 | 32 | VerticalLayout searchLayout = new VerticalLayout(); 33 | searchLayout.setSpacing(true); 34 | searchLayout.setWidth(50, PERCENTAGE); 35 | 36 | HorizontalLayout actionHeader = new HorizontalLayout(filterField, searchButton); 37 | actionHeader.setSpacing(true); 38 | actionHeader.setWidth(100, PERCENTAGE); 39 | actionHeader.setExpandRatio(filterField, 1.0f); 40 | filterField.setWidth(100, PERCENTAGE); 41 | 42 | searchLayout.addComponent(actionHeader); 43 | 
searchLayout.addComponent(queryDescriptionLabel); 44 | 45 | addComponent(searchLayout); 46 | setComponentAlignment(searchLayout, Alignment.TOP_CENTER); 47 | 48 | resultLayout.setWidth(80, PERCENTAGE); 49 | resultLayout.setSpacing(true); 50 | 51 | addComponent(resultLayout); 52 | setComponentAlignment(resultLayout, Alignment.TOP_CENTER); 53 | 54 | } 55 | 56 | private void search() { 57 | resultLayout.removeAllComponents(); 58 | List<String> query = QueryParser.parseQuery(filterField.getValue()); 59 | LOG.info("Parsed '{}' from query '{}'", query, filterField.getValue()); 60 | List<NamedQuery> includedNamed = Lists.newArrayList(); 61 | List<NamedQuery> excludedNamed = Lists.newArrayList(); 62 | StringBuilder additionalQuery = new StringBuilder(); 63 | for (String q : query) { 64 | boolean excluded = q.startsWith("-"); 65 | String name = q.replaceAll("^[+-]+", ""); 66 | NamedQuery namedQuery = ElasticSearch.getNamedQueryOperations().get(name); 67 | if (namedQuery != null && excluded) { 68 | excludedNamed.add(namedQuery); 69 | LOG.info("Named query '{}' is negative", namedQuery.getName()); 70 | } else if (namedQuery != null) { 71 | includedNamed.add(namedQuery); 72 | LOG.info("Named query '{}' is positive", namedQuery.getName()); 73 | } else { 74 | additionalQuery.append(" ").append(q); 75 | } 76 | } 77 | LOG.info("Additional query: '{}'", additionalQuery.toString().trim()); 78 | PageableList<HighlightedSearchResult> result = ElasticSearch.getDocumentOperations().query(includedNamed, excludedNamed, additionalQuery.toString().trim()); 79 | List<NamedQuery> namedQueries = Lists.newArrayList(includedNamed); 80 | namedQueries.addAll(excludedNamed); 81 | 82 | queryDescriptionLabel.setValue(String.format("Named Queries: %s, Additional Query: '%s'", 83 | namedQueries.stream().map(NamedQuery::getName).collect(Collectors.joining("', '", "'", "'")), 84 | additionalQuery.toString().trim())); 85 | 86 | for (HighlightedSearchResult r : result.getItems()) { 87 | resultLayout.addComponent(new ResultPanel(r)); 88 | } 89 | 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /analysis-ui/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, stdout 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Target=System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n 7 | 8 | 9 | log4j.logger.org.apache=INFO 10 | log4j.logger.org.eclipse.jetty=INFO 11 | log4j.logger.org.elasticsearch=INFO -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/addons.scss: -------------------------------------------------------------------------------- 1 | /* This file is automatically managed and will be overwritten from time to time. */ 2 | /* Do not manually edit this file. */ 3 | 4 | /* Import and include this mixin into your project theme to include the addon themes */ 5 | @mixin addons { 6 | } 7 | 8 | -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/analysistheme.scss: -------------------------------------------------------------------------------- 1 | // If you edit this file you need to compile the theme. See README.md for details. 2 | // Global variable overrides. Must be declared before importing Valo.
3 | // Defines the plaintext font size, weight and family. Font size affects general component sizing. 4 | 5 | //$v-font-size: 16px; 6 | //$v-font-weight: 300; 7 | //$v-font-family: "Open Sans", sans-serif; 8 | 9 | // Defines the border used by all components. 10 | //$v-border: 1px solid (v-shade 0.7); 11 | //$v-border-radius: 4px; 12 | 13 | // Affects the color of some component elements, e.g Button, Panel title, etc 14 | //$v-background-color: hsl(210, 0%, 98%); 15 | 16 | // Affects the color of content areas, e.g Panel and Window content, TextField input etc 17 | //$v-app-background-color: $v-background-color; 18 | 19 | // Affects the visual appearance of all components 20 | //$v-gradient: v-linear 8%; 21 | //$v-bevel-depth: 30%; 22 | //$v-shadow-opacity: 5%; 23 | 24 | // Defines colors for indicating status (focus, success, failure) 25 | //$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically 26 | //$v-friendly-color: #2c9720; 27 | //$v-error-indicator-color: #ed473b; 28 | 29 | // For more information, see: https://vaadin.com/book/-/page/themes.valo.html 30 | // Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples 31 | 32 | @import "../valo/valo.scss"; 33 | 34 | @mixin analysistheme { 35 | @include valo; 36 | 37 | // Insert your own theme rules here 38 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/styles.scss: -------------------------------------------------------------------------------- 1 | @import "analysistheme.scss"; 2 | @import "addons.scss"; 3 | 4 | // This file prefixes all rules with the theme name to avoid causing conflicts with other themes. 5 | // The actual styles should be defined in analysisheme.scss 6 | 7 | .analysistheme { 8 | @include addons; 9 | @include analysistheme; 10 | } -------------------------------------------------------------------------------- /bin/create-es-index.sh: -------------------------------------------------------------------------------- 1 | # $1 - index name (docs, http_sources) 2 | # $2 - ES index config file name 3 | # $3 - ES host 4 | # $4 - application name 5 | 6 | if [ -z "$4" ] 7 | then 8 | export INDEX_URL="http://$3:9200/$1_v1" 9 | else 10 | export INDEX_URL="http://$3:9200/$4-$1_v1" 11 | fi 12 | 13 | 14 | curl -H "Content-Type:application/json" -XDELETE "$INDEX_URL" 15 | echo 16 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL" -d @elasticsearch/src/main/resources/indices/$2 17 | echo 18 | if [ -z "$4" ] 19 | then 20 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$1" 21 | echo 22 | else 23 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$4-$1" 24 | echo 25 | fi 26 | -------------------------------------------------------------------------------- /bin/create-es-indices.sh: -------------------------------------------------------------------------------- 1 | bin/create-es-index.sh docs document.json ${1:-localhost} $2 2 | bin/create-es-index.sh named_queries query.json ${1:-localhost} $2 3 | bin/create-es-index.sh http_sources http_source.json ${1:-localhost} $2 4 | bin/create-es-index.sh http_source_tests http_source_test.json ${1:-localhost} $2 5 | bin/create-es-index.sh urls url.json ${1:-localhost} $2 6 | -------------------------------------------------------------------------------- /bin/deploy-crawler.sh: -------------------------------------------------------------------------------- 1 | 
STORM_HOME=/opt/storm/apache-storm-1.1.1 2 | mvn clean install -Pbigjar -Dstorm.scope=provided 3 | $STORM_HOME/bin/storm jar crawler/target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -conf crawler/conf/local.yaml 4 | -------------------------------------------------------------------------------- /bin/run-administration-ui.sh: -------------------------------------------------------------------------------- 1 | ( cd administration-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/administration-ui-standalone.jar ) 2 | -------------------------------------------------------------------------------- /bin/run-analysis-ui.sh: -------------------------------------------------------------------------------- 1 | ( cd analysis-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/analysis-ui-standalone.jar ) -------------------------------------------------------------------------------- /bin/run-crawler.sh: -------------------------------------------------------------------------------- 1 | ( cd crawler && mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests && java -cp target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -local -conf conf/local.yaml ) 2 | -------------------------------------------------------------------------------- /crawler/conf/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | fetcher.server.delay: 4.5 2 | fetcher.server.min.delay: 3.0 3 | fetcher.queue.mode: "byHost" 4 | fetcher.threads.per.queue: 1 5 | fetcher.threads.number: 5 6 | 7 | partition.url.mode: "byHost" 8 | 9 | metadata.track.path: false 10 | metadata.track.depth: false 11 | metadata.transfer: 12 | - "source" 13 | 14 | http.agent.name: "NewsRadar" 15 | http.agent.version: "1.0" 16 | http.agent.description: "News Crawler" 17 | http.agent.url: "" 18 | http.agent.email: "" 19 | 20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3" 21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 22 | http.content.limit: 1048576 23 | http.store.responsetime: false 24 | http.timeout: 30000 25 | 26 | http.robots.403.allow: true 27 | 28 | protocols: "http,https" 29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 31 | 32 | urlfilters.config.file: "urlfilters.json" 33 | 34 | # revisit a page monthly (value in minutes) 35 | fetchInterval.default: 44640 36 | 37 | # revisit a page with a fetch error after 2 hours (value in minutes) 38 | fetchInterval.fetch.error: 120 39 | 40 | # revisit a page with an error every month (value in minutes) 41 | fetchInterval.error: 44640 42 | 43 | # Default implementation of Scheduler 44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler" 45 | 46 | topology.workers: 1 47 | topology.sleep.spout.wait.strategy.time.ms: 5000 48 | topology.message.timeout.secs: 300 49 | topology.max.spout.pending: 100 50 | topology.debug: false 51 | 52 | # ElasticSearch configuration 53 | es.hostname: "elasticsearch" 54 | es.rest.port: 9200 55 | 56 | es.urls.index.name: "urls" 57 | es.urls.doc.type: "url" 58 | es.docs.index.name: "docs" 59 | es.docs.doc.type: "doc" 60 | es.httpsource.index.name: "http_sources" 61 | es.httpsource.doc.type: "http_source" 62 | 63 | # MetricsConsumer configuration 64 | 
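# Note: es.metrics.addresses below points at the 9300 transport address,
# while the crawler's own Elasticsearch operations go through es.rest.port (9200) above.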
es.metrics.addresses: "elasticsearch:9300" 65 | es.metrics.index.name: "metrics" 66 | es.metrics.doc.type: "datapoint" 67 | es.metrics.cluster.name: "elasticsearch" 68 | es.metrics.blacklist: 69 | - "__" 70 | - "uptime" 71 | - "memory" 72 | - "GC" 73 | - "newWorkerEvent" 74 | - "startTimeSecs" 75 | -------------------------------------------------------------------------------- /crawler/conf/local.yaml: -------------------------------------------------------------------------------- 1 | fetcher.server.delay: 4.5 2 | fetcher.server.min.delay: 3.0 3 | fetcher.queue.mode: "byHost" 4 | fetcher.threads.per.queue: 1 5 | fetcher.threads.number: 5 6 | 7 | partition.url.mode: "byHost" 8 | 9 | metadata.track.path: false 10 | metadata.track.depth: false 11 | metadata.transfer: 12 | - "source" 13 | 14 | http.agent.name: "NewsRadar" 15 | http.agent.version: "1.0" 16 | http.agent.description: "News Crawler" 17 | http.agent.url: "" 18 | http.agent.email: "" 19 | 20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3" 21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 22 | http.content.limit: 1048576 23 | http.store.responsetime: false 24 | http.timeout: 30000 25 | 26 | http.robots.403.allow: true 27 | 28 | protocols: "http,https" 29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 31 | 32 | urlfilters.config.file: "urlfilters.json" 33 | 34 | # revisit a page monthly (value in minutes) 35 | fetchInterval.default: 44640 36 | 37 | # revisit a page with a fetch error after 2 hours (value in minutes) 38 | fetchInterval.fetch.error: 120 39 | 40 | # revisit a page with an error every month (value in minutes) 41 | fetchInterval.error: 44640 42 | 43 | # Default implementation of Scheduler 44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler" 45 | 46 | topology.workers: 1 47 | topology.sleep.spout.wait.strategy.time.ms: 5000 48 | topology.message.timeout.secs: 300 49 | topology.max.spout.pending: 100 50 | topology.debug: false 51 | 52 | # ElasticSearch configuration 53 | es.hostname: "localhost" 54 | es.rest.port: 9200 55 | 56 | es.urls.index.name: "urls" 57 | es.urls.doc.type: "url" 58 | es.docs.index.name: "docs" 59 | es.docs.doc.type: "doc" 60 | es.httpsource.index.name: "http_sources" 61 | es.httpsource.doc.type: "http_source" 62 | 63 | # MetricsConsumer configuration 64 | es.metrics.addresses: "localhost:9300" 65 | es.metrics.index.name: "metrics" 66 | es.metrics.doc.type: "datapoint" 67 | es.metrics.cluster.name: "elasticsearch" 68 | es.metrics.blacklist: 69 | - "__" 70 | - "uptime" 71 | - "memory" 72 | - "GC" 73 | - "newWorkerEvent" 74 | - "startTimeSecs" 75 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerConstants.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; 4 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.util.concurrent.TimeUnit; 9 | 10 | public class CrawlerConstants { 11 | private static final Logger LOG = LoggerFactory.getLogger(CrawlerConstants.class); 12 | 13 | private static long getReloadDelayInSeconds() { 14 | long reloadDelay = 300; 15 | 
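// The DEFAULT_SOURCE_RELOAD_DELAY environment variable, when set, overrides
// this 300-second default; its value is interpreted as seconds.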
String envVar = System.getenv("DEFAULT_SOURCE_RELOAD_DELAY"); 16 | if (envVar != null) { 17 | try { 18 | reloadDelay = Long.parseLong(envVar); 19 | } catch (NumberFormatException e) { 20 | LOG.warn("Environment variable 'DEFAULT_SOURCE_RELOAD_DELAY' is not a number '{}'", envVar); 21 | } 22 | } 23 | return reloadDelay; 24 | } 25 | 26 | public static final long MIN_FETCH_DELAY = TimeUnit.MINUTES.toMillis(1); 27 | public static final long DEFAULT_URL_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10); 28 | public static final long DEFAULT_FEED_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10); 29 | public static final long DEFAULT_SITEMAP_FETCH_DELAY = TimeUnit.MINUTES.toMillis(30); 30 | public static final long DEFAULT_SOURCE_RELOAD_DELAY = TimeUnit.SECONDS.toMillis(getReloadDelayInSeconds()); 31 | 32 | public static final String META_IS_SITEMAP = SiteMapParserBolt.isSitemapKey; 33 | public static final String META_IS_FEED = FeedParserBolt.isFeedKey; 34 | public static final String META_IS_SEED = "isSeed"; 35 | public static final String META_SOURCE = "source"; 36 | public static final String META_PUBLISHED = "published"; 37 | public static final String META_DISCOVERED = "discovered"; 38 | public static final String META_FEED_PUBLISHED = "feed.publishedDate"; 39 | 40 | public static final String URL_FILTERS_FILE = "urlfilters.config.file"; 41 | 42 | public static final String PARTIAL_ANALYSIS_STATUS = "PARTIAL_ANALYSIS"; 43 | } 44 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerTopology.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.ConfigurableTopology; 4 | import com.digitalpebble.stormcrawler.Constants; 5 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; 6 | import com.digitalpebble.stormcrawler.bolt.FetcherBolt; 7 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; 8 | import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt; 9 | import lt.tokenmill.crawling.crawler.bolt.ArticleIndexerBolt; 10 | import lt.tokenmill.crawling.crawler.bolt.LinkExtractorBolt; 11 | import lt.tokenmill.crawling.crawler.bolt.StatusUpdaterBolt; 12 | import lt.tokenmill.crawling.crawler.spout.UrlGeneratorSpout; 13 | import org.apache.storm.Config; 14 | import org.apache.storm.topology.IRichBolt; 15 | import org.apache.storm.topology.IRichSpout; 16 | import org.apache.storm.topology.TopologyBuilder; 17 | import org.apache.storm.tuple.Fields; 18 | 19 | public class CrawlerTopology extends ConfigurableTopology { 20 | 21 | private final ServiceProvider serviceProvider; 22 | 23 | public static void main(String[] args) throws Exception { 24 | ConfigurableTopology.start(new CrawlerTopology(), args); 25 | } 26 | 27 | public CrawlerTopology() { 28 | this(new DefaultServiceProvider()); 29 | } 30 | 31 | public CrawlerTopology(ServiceProvider serviceProvider) { 32 | this.serviceProvider = serviceProvider; 33 | } 34 | 35 | @Override 36 | protected int run(String[] strings) { 37 | TopologyBuilder builder = new TopologyBuilder(); 38 | 39 | builder.setSpout("generator", createUrlGeneratorSpout(serviceProvider)); 40 | 41 | builder.setBolt("partitioner", new URLPartitionerBolt()) 42 | .shuffleGrouping("generator"); 43 | 44 | builder.setBolt("fetch", new FetcherBolt()) 45 | .fieldsGrouping("partitioner", new Fields("key")); 46 | 47 | builder.setBolt("sitemap", new SiteMapParserBolt()) 48 | 
.localOrShuffleGrouping("fetch"); 49 | 50 | builder.setBolt("feed", new FeedParserBolt()) 51 | .localOrShuffleGrouping("sitemap"); 52 | 53 | builder.setBolt("links", createLinkExtractor(serviceProvider)) 54 | .localOrShuffleGrouping("feed"); 55 | 56 | builder.setBolt("index", createArticleIndexer(serviceProvider)) 57 | .localOrShuffleGrouping("fetch"); 58 | 59 | builder.setBolt("status", createStatusUpdater(serviceProvider)) 60 | .localOrShuffleGrouping("fetch", Constants.StatusStreamName) 61 | .localOrShuffleGrouping("sitemap", Constants.StatusStreamName) 62 | .localOrShuffleGrouping("index", Constants.StatusStreamName) 63 | .localOrShuffleGrouping("links", Constants.StatusStreamName); 64 | 65 | String topologyName = (String) conf.getOrDefault(Config.TOPOLOGY_NAME, "crawler"); 66 | System.setProperty("es.set.netty.runtime.available.processors", "false"); 67 | return submit(topologyName, conf, builder); 68 | } 69 | 70 | protected IRichSpout createUrlGeneratorSpout(ServiceProvider serviceProvider) { 71 | return new UrlGeneratorSpout(serviceProvider); 72 | } 73 | 74 | protected IRichBolt createLinkExtractor(ServiceProvider serviceProvider) { 75 | return new LinkExtractorBolt(serviceProvider); 76 | } 77 | 78 | protected IRichBolt createArticleIndexer(ServiceProvider serviceProvider) { 79 | return new ArticleIndexerBolt(serviceProvider); 80 | } 81 | 82 | protected IRichBolt createStatusUpdater(ServiceProvider serviceProvider) { 83 | return new StatusUpdaterBolt(serviceProvider); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.util.ConfUtils; 4 | import com.google.common.collect.Maps; 5 | import lt.tokenmill.crawling.es.*; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.io.Serializable; 10 | import java.util.Map; 11 | 12 | public class DefaultServiceProvider implements ServiceProvider, Serializable { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(DefaultServiceProvider.class); 15 | 16 | private static final Map ES_CONNECTIONS = Maps.newConcurrentMap(); 17 | 18 | public static ElasticConnection getElasticConnection(Map conf) { 19 | String hostname = ConfUtils.getString(conf, ElasticConstants.ES_HOSTNAME_PARAM); 20 | int restPort = ConfUtils.getInt(conf, ElasticConstants.ES_REST_PORT, 9200); 21 | String restScheme = ConfUtils.getString(conf, ElasticConstants.ES_REST_SCHEME, "http"); 22 | if (ES_CONNECTIONS.containsKey(hostname)) { 23 | return ES_CONNECTIONS.get(hostname); 24 | } else { 25 | ElasticConnection elasticConnection = ElasticConnection.getConnection(hostname, restPort, restScheme); 26 | ES_CONNECTIONS.put(hostname, elasticConnection); 27 | return ES_CONNECTIONS.get(hostname); 28 | } 29 | } 30 | 31 | public EsHttpUrlOperations createEsHttpUrlOperations(Map conf) { 32 | ElasticConnection connection = getElasticConnection(conf); 33 | String urlsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_URLS_INDEX_NAME_PARAM); 34 | String urlsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_URLS_DOC_TYPE_PARAM); 35 | return EsHttpUrlOperations.getInstance(connection, urlsIndexName, urlsDocumentType); 36 | } 37 | 38 | public EsHttpSourceOperations createEsHttpSourceOperations(Map conf) { 39 | ElasticConnection connection 
= getElasticConnection(conf); 40 | String sourcesIndexName = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM); 41 | String sourcesDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM); 42 | return EsHttpSourceOperations.getInstance(connection, sourcesIndexName, sourcesDocumentType); 43 | } 44 | 45 | public EsDocumentOperations creatEsDocumentOperations(Map conf) { 46 | ElasticConnection connection = getElasticConnection(conf); 47 | String docsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_INDEX_NAME_PARAM); 48 | String docsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_DOC_TYPE_PARAM); 49 | return EsDocumentOperations.getInstance(connection, docsIndexName, docsDocumentType); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import lt.tokenmill.crawling.es.EsDocumentOperations; 4 | import lt.tokenmill.crawling.es.EsHttpSourceOperations; 5 | import lt.tokenmill.crawling.es.EsHttpUrlOperations; 6 | 7 | import java.util.Map; 8 | 9 | /*** 10 | * Interface for external service factory. 11 | */ 12 | public interface ServiceProvider { 13 | 14 | EsHttpUrlOperations createEsHttpUrlOperations(Map conf); 15 | 16 | EsHttpSourceOperations createEsHttpSourceOperations(Map conf); 17 | 18 | EsDocumentOperations creatEsDocumentOperations(Map conf); 19 | } 20 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.bolt; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.persistence.AbstractStatusUpdaterBolt; 5 | import com.digitalpebble.stormcrawler.persistence.Status; 6 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 7 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider; 8 | import lt.tokenmill.crawling.crawler.ServiceProvider; 9 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache; 10 | import lt.tokenmill.crawling.data.HttpSource; 11 | import lt.tokenmill.crawling.es.*; 12 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 13 | import org.apache.storm.metric.api.MultiCountMetric; 14 | import org.apache.storm.task.OutputCollector; 15 | import org.apache.storm.task.TopologyContext; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.util.Date; 20 | import java.util.Map; 21 | 22 | public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt { 23 | 24 | private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class); 25 | 26 | private MultiCountMetric eventCounter; 27 | 28 | private EsHttpUrlOperations esUrlsOperations; 29 | private EsHttpSourceOperations esHttpSourcesOperations; 30 | private ServiceProvider serviceProvider; 31 | 32 | public StatusUpdaterBolt(ServiceProvider serviceProvider) { 33 | this.serviceProvider = serviceProvider; 34 | } 35 | 36 | @Override 37 | public void store(String url, Status status, Metadata metadata, Date nextFetch) throws Exception { 38 | try { 39 | String source = metadata.getFirstValue(CrawlerConstants.META_SOURCE); 40 | Boolean isSeed = 
Boolean.parseBoolean(metadata.getFirstValue(CrawlerConstants.META_IS_SEED)); 41 | HttpSource httpSource = EsHttpSourcesCache.get(esHttpSourcesOperations, source); 42 | UrlFilters filters = UrlFiltersCache.get(httpSource); 43 | 44 | String filtered = filters.filter(url); 45 | if (isSeed || (filtered == null && status.equals(Status.DISCOVERED))) { 46 | LOG.debug("Url '{}' is seed or rejected by filters", url); 47 | return; 48 | } 49 | 50 | String id = (filtered == null) ? url : filtered; 51 | 52 | LOG.debug("Setting '{}' status to '{}'", id, status); 53 | 54 | 55 | boolean create = status.equals(Status.DISCOVERED); 56 | String published = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED); 57 | if (published == null) { 58 | published = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED); 59 | } 60 | esUrlsOperations.upsertUrlStatus(id, published, source, create, status); 61 | 62 | if (status == Status.DISCOVERED) { 63 | eventCounter.scope("urls_discovered").incr(); 64 | } 65 | } catch (Exception e) { 66 | LOG.error("Failed to set status for url '{}'", url, e); 67 | } 68 | } 69 | 70 | 71 | @Override 72 | public void prepare(Map conf, TopologyContext context, OutputCollector outputCollector) { 73 | super.prepare(conf, context, outputCollector); 74 | this.eventCounter = context.registerMetric(this.getClass().getSimpleName(), new MultiCountMetric(), 10); 75 | this.esUrlsOperations = this.serviceProvider.createEsHttpUrlOperations(conf); 76 | this.esHttpSourcesOperations = this.serviceProvider.createEsHttpSourceOperations(conf); 77 | } 78 | 79 | @Override 80 | public void cleanup() { 81 | super.cleanup(); 82 | } 83 | } -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/HttpSourceConfiguration.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import com.google.common.collect.Iterables; 4 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 5 | import lt.tokenmill.crawling.crawler.utils.PrioritizedSource; 6 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache; 7 | import lt.tokenmill.crawling.data.HttpSource; 8 | import lt.tokenmill.crawling.es.EsHttpSourcesCache; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.Iterator; 13 | import java.util.List; 14 | import java.util.PriorityQueue; 15 | import java.util.stream.Collectors; 16 | 17 | import static java.lang.System.currentTimeMillis; 18 | 19 | public class HttpSourceConfiguration { 20 | 21 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceConfiguration.class); 22 | 23 | private final List sources; 24 | private final List sourceUrls; 25 | private final Iterator sourceCycle; 26 | private final PriorityQueue prioritizedSources; 27 | 28 | private static long lastReloadMillis = 0; 29 | 30 | private HttpSourceConfiguration(List sources) { 31 | this.sources = sources; 32 | this.sourceUrls = sources.stream() 33 | .map(HttpSource::getUrl) 34 | .collect(Collectors.toList()); 35 | LOG.info("Loaded {} active HTTP sources", this.sourceUrls.size()); 36 | this.sourceCycle = Iterables.cycle(this.sourceUrls).iterator(); 37 | this.prioritizedSources = 38 | new PriorityQueue<>(new PrioritizedSource.PrioritizedUrlComparator()); 39 | sources.forEach(s -> { 40 | s.getUrls().forEach(u -> prioritizedSources.offer(PrioritizedSource.createUrl(u, s))); 41 | s.getFeeds().forEach(u -> 
prioritizedSources.offer(PrioritizedSource.createFeed(u, s))); 42 | s.getSitemaps().forEach(u -> prioritizedSources.offer(PrioritizedSource.createSitemap(u, s))); 43 | }); 44 | } 45 | 46 | public PrioritizedSource prioritized() { 47 | PrioritizedSource prioritized = prioritizedSources.peek(); 48 | if (prioritized != null && 49 | (prioritized.getNextFetchTime() <= currentTimeMillis())) { 50 | prioritized = prioritizedSources.poll(); 51 | prioritized.recalculateNextFetchTime(); 52 | prioritizedSources.offer(prioritized); 53 | return prioritized; 54 | } 55 | return null; 56 | } 57 | 58 | public int maxTries() { 59 | return Math.min(10, sourceUrls.size()); 60 | } 61 | 62 | public boolean hasNextActive() { 63 | return sourceCycle.hasNext(); 64 | } 65 | 66 | 67 | public String nextActive() { 68 | return sourceCycle.next(); 69 | } 70 | 71 | public static HttpSourceConfiguration reload(HttpSourceConfiguration current, List sources) { 72 | HttpSourceConfiguration configuration; 73 | if (current != null && current.sources.equals(sources)) { 74 | LOG.info("HTTP source configuration didn't change. Using current version"); 75 | configuration = current; 76 | } else { 77 | configuration = new HttpSourceConfiguration(sources); 78 | EsHttpSourcesCache.invalidate(); 79 | UrlFiltersCache.invalidate(); 80 | } 81 | lastReloadMillis = currentTimeMillis(); 82 | return configuration; 83 | } 84 | 85 | public static boolean needsReload() { 86 | LOG.info("Checking reloading timeout. Remaining milliseconds: {}", 87 | lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY - currentTimeMillis()); 88 | return lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY < currentTimeMillis(); 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/PrioritizedSource.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | 6 | import java.io.Serializable; 7 | import java.util.Comparator; 8 | import java.util.concurrent.TimeUnit; 9 | 10 | import static lt.tokenmill.crawling.crawler.CrawlerConstants.*; 11 | 12 | public class PrioritizedSource implements Serializable { 13 | 14 | 15 | 16 | private final String url; 17 | 18 | private final HttpSource source; 19 | 20 | private Long delay = MIN_FETCH_DELAY; 21 | 22 | private boolean sitemap = false; 23 | private boolean feed = false; 24 | 25 | private Long nextFetchTime = System.currentTimeMillis(); 26 | 27 | private PrioritizedSource(String url, HttpSource source) { 28 | this.url = url; 29 | this.source = source; 30 | } 31 | 32 | private void setDelay(Long delay) { 33 | this.delay = Math.max(delay, MIN_FETCH_DELAY); 34 | } 35 | 36 | private void setSitemap(boolean sitemap) { 37 | this.sitemap = sitemap; 38 | } 39 | 40 | private void setFeed(boolean feed) { 41 | this.feed = feed; 42 | } 43 | 44 | public void recalculateNextFetchTime() { 45 | nextFetchTime = System.currentTimeMillis() + delay; 46 | } 47 | 48 | public String getUrl() { 49 | return url; 50 | } 51 | 52 | public boolean isSitemap() { 53 | return sitemap; 54 | } 55 | 56 | public boolean isFeed() { 57 | return feed; 58 | } 59 | 60 | public HttpSource getSource() { 61 | return source; 62 | } 63 | 64 | public long getNextFetchTime() { 65 | return nextFetchTime; 66 | } 67 | 68 | public static class PrioritizedUrlComparator implements Comparator, Serializable { 69 | 
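// Ascending order on nextFetchTime makes the PriorityQueue in
// HttpSourceConfiguration a min-heap: peek()/poll() always yield the
// URL, feed or sitemap entry that is due to be fetched soonest.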
70 | @Override 71 | public int compare(PrioritizedSource u1, PrioritizedSource u2) { 72 | return u1.nextFetchTime.compareTo(u2.nextFetchTime); 73 | } 74 | } 75 | 76 | public static PrioritizedSource createUrl(String url, HttpSource source) { 77 | PrioritizedSource result = new PrioritizedSource(url, source); 78 | long delay = source.getUrlRecrawlDelayInSecs() != null ? 79 | TimeUnit.SECONDS.toMillis(source.getUrlRecrawlDelayInSecs()) : DEFAULT_URL_FETCH_DELAY; 80 | result.setDelay(delay); 81 | return result; 82 | } 83 | 84 | public static PrioritizedSource createFeed(String url, HttpSource source) { 85 | PrioritizedSource result = new PrioritizedSource(url, source); 86 | long delay = source.getFeedRecrawlDelayInSecs() != null ? 87 | TimeUnit.SECONDS.toMillis(source.getFeedRecrawlDelayInSecs()) : DEFAULT_FEED_FETCH_DELAY; 88 | result.setDelay(delay); 89 | result.setFeed(true); 90 | return result; 91 | } 92 | 93 | public static PrioritizedSource createSitemap(String url, HttpSource source) { 94 | PrioritizedSource result = new PrioritizedSource(url, source); 95 | long delay = source.getSitemapRecrawlDelayInSecs() != null ? 96 | TimeUnit.SECONDS.toMillis(source.getSitemapRecrawlDelayInSecs()) : DEFAULT_SITEMAP_FETCH_DELAY; 97 | result.setDelay(delay); 98 | result.setSitemap(true); 99 | return result; 100 | } 101 | } -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFilterUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.filtering.URLFilters; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | import java.net.URL; 10 | import java.util.Map; 11 | 12 | public class UrlFilterUtils { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtils.class); 15 | 16 | public static URLFilters load(Map conf, String filtersConfigFile) { 17 | if (filtersConfigFile != null) { 18 | try { 19 | URLFilters loaded = new URLFilters(conf, filtersConfigFile); 20 | LOG.info("Loaded URLFilters from '{}'", filtersConfigFile); 21 | return loaded; 22 | } catch (IOException e) { 23 | LOG.error("Exception caught while loading the URLFilters"); 24 | throw new RuntimeException("Exception caught while loading the URLFilters", e); 25 | } 26 | } else { 27 | return URLFilters.emptyURLFilters; 28 | } 29 | } 30 | 31 | public static String firstMatch(URL sourceUrl, Metadata metadata, String targetUrl, URLFilters...filters) { 32 | for (URLFilters filter : filters) { 33 | String filtered = filter.filter(sourceUrl, metadata, targetUrl); 34 | if (filtered != null) { 35 | return filtered; 36 | } 37 | } 38 | return null; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFiltersCache.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | import com.google.common.cache.Cache; 4 | import com.google.common.cache.CacheBuilder; 5 | import lt.tokenmill.crawling.data.HttpSource; 6 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 7 | 8 | import java.util.concurrent.TimeUnit; 9 | 10 | public class UrlFiltersCache { 11 | 12 | private static final Cache CACHE; 13 | 14 | static { 15 | CACHE 
= CacheBuilder.newBuilder() 16 | .expireAfterWrite(10, TimeUnit.MINUTES) 17 | .build(); 18 | } 19 | 20 | public static UrlFilters get(HttpSource source) { 21 | UrlFilters filters = CACHE.getIfPresent(source.getUrl()); 22 | if (filters == null) { 23 | filters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters()); 24 | CACHE.put(source.getUrl(), filters); 25 | } 26 | return filters; 27 | } 28 | 29 | public static void invalidate() { 30 | CACHE.invalidateAll(); 31 | } 32 | } -------------------------------------------------------------------------------- /crawler/src/main/resources/urlfilters.json: -------------------------------------------------------------------------------- 1 | { 2 | "com.digitalpebble.stormcrawler.filtering.URLFilters": [ 3 | { 4 | "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter", 5 | "name": "RegexURLFilter", 6 | "params": { 7 | "regexFilterFile": "urlfilters.txt" 8 | } 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /crawler/src/main/resources/urlfilters.txt: -------------------------------------------------------------------------------- 1 | #Discard URLs, longer than 512 chars 2 | -.{512,} 3 | 4 | #Discard urls which are actually links to other urls 5 | -^https?://.*https?:.* 6 | 7 | #Discard urls containing illegal characters: space, %20 or # 8 | -.*(:?%20| |#|\@).* 9 | 10 | #Discard media or binary files 11 | -(?i).*\.(exe|dmg|csv|mp3|mp4|m4a|avi|mov|swf|wmv|dat|mpg|mpg4|flm|mtv|video|divx|mpeg4|film|xwmv|exo|pdf|jpg|jpeg|png|bmp|gif|doc|docx|xls|xlsx|ppt|pptx|rss)$ 12 | 13 | #Allow everything else 14 | +. -------------------------------------------------------------------------------- /crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlFilterUtilsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.filtering.URLFilters; 5 | import com.digitalpebble.stormcrawler.util.ConfUtils; 6 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 7 | import lt.tokenmill.crawling.crawler.utils.UrlFilterUtils; 8 | import org.junit.Test; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.net.MalformedURLException; 13 | import java.net.URL; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | 17 | import static org.junit.Assert.assertNotNull; 18 | import static org.junit.Assert.assertNull; 19 | 20 | public class UrlFilterUtilsTest { 21 | 22 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtilsTest.class); 23 | private final String testSourceUrl = "http://www.tokenmill.lt/"; 24 | 25 | @Test 26 | public void testUrlFilters() { 27 | Map conf = new HashMap(); 28 | conf.put(CrawlerConstants.URL_FILTERS_FILE, "urlfilters.json"); 29 | String filtersConfigFile = ConfUtils.getString(conf, CrawlerConstants.URL_FILTERS_FILE); 30 | URLFilters filters = UrlFilterUtils.load(conf, filtersConfigFile); 31 | URL sourceUrl; 32 | try { 33 | sourceUrl = new URL(testSourceUrl); 34 | } catch (MalformedURLException e) { 35 | // we would have known by now as previous components check whether the URL is valid 36 | LOG.error("MalformedURLException on {}", testSourceUrl); 37 | return; 38 | } 39 | // test good URL 40 | assertNotNull(null, UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl, filters)); 41 | // test on bad URL 42 | 
assertNull(null, UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl.concat("song.mp3"), filters)); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpoutTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider; 4 | import org.junit.Test; 5 | 6 | public class UrlGeneratorSpoutTest { 7 | 8 | 9 | @Test 10 | public void test() { 11 | UrlGeneratorSpout spout = new UrlGeneratorSpout(new DefaultServiceProvider()); 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /data-model/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <parent> 4 | <artifactId>crawling-framework</artifactId> 5 | <groupId>lt.tokenmill.crawling</groupId> 6 | <version>0.3.4-SNAPSHOT</version> 7 | </parent> 8 | <modelVersion>4.0.0</modelVersion> 9 | 10 | <artifactId>data-model</artifactId> 11 | 12 | <dependencies> 13 | <dependency> 14 | <groupId>joda-time</groupId> 15 | <artifactId>joda-time</artifactId> 16 | </dependency> 17 | <dependency> 18 | <groupId>com.google.guava</groupId> 19 | <artifactId>guava</artifactId> 20 | </dependency> 21 | <dependency> 22 | <groupId>junit</groupId> 23 | <artifactId>junit</artifactId> 24 | <version>4.13.1</version> 25 | <scope>test</scope> 26 | </dependency> 27 | </dependencies> 28 | 29 | <profiles> 30 | <profile> 31 | <id>release</id> 32 | <build> 33 | <plugins> 34 | <plugin> 35 | <groupId>org.apache.maven.plugins</groupId> 36 | <artifactId>maven-source-plugin</artifactId> 37 | </plugin> 38 | <plugin> 39 | <groupId>org.apache.maven.plugins</groupId> 40 | <artifactId>maven-jar-plugin</artifactId> 41 | </plugin> 42 | <plugin> 43 | <groupId>org.apache.maven.plugins</groupId> 44 | <artifactId>maven-javadoc-plugin</artifactId> 45 | </plugin> 46 | </plugins> 47 | </build> 48 | </profile> 49 | </profiles> 50 | </project> -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/DataUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import com.google.common.base.Splitter; 4 | import org.joda.time.DateTime; 5 | import org.joda.time.DateTimeZone; 6 | import org.joda.time.format.DateTimeFormat; 7 | import org.joda.time.format.DateTimeFormatter; 8 | 9 | import java.io.Serializable; 10 | import java.util.List; 11 | import java.util.stream.Collectors; 12 | 13 | public class DataUtils implements Serializable { 14 | 15 | private static final DateTimeFormatter FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss"); 16 | 17 | public static Integer tryParseInteger(Object object) { 18 | try { 19 | return (object != null) ? Integer.parseInt(object.toString()) : null; 20 | } catch (NumberFormatException e) { 21 | } 22 | return null; 23 | } 24 | 25 | public static Long tryParseLong(Object object) { 26 | try { 27 | return (object != null) ? Long.parseLong(object.toString()) : null; 28 | } catch (NumberFormatException e) { 29 | } 30 | return null; 31 | } 32 | 33 | public static List<String> parseStringList(Object object) { 34 | if (object == null) { 35 | return null; 36 | } 37 | return Splitter.onPattern("(?:\r?\n)+") 38 | .splitToList(object.toString()) 39 | .stream() 40 | .map(String::trim) 41 | .filter(s -> !s.isEmpty()) 42 | .collect(Collectors.toList()); 43 | } 44 | 45 | public static String formatInUTC(DateTime date) { 46 | return date != null ? FORMATTER.print(date.toDateTime(DateTimeZone.UTC)) : null; 47 | } 48 | 49 | public static DateTime parseFromUTC(String date) { 50 | return date != null ?
FORMATTER.parseDateTime(date) : null; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HighlightedSearchResult.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class HighlightedSearchResult implements Serializable { 7 | 8 | private HttpArticle article; 9 | 10 | private List highlights; 11 | 12 | public HighlightedSearchResult(HttpArticle article, List highlights) { 13 | this.article = article; 14 | this.highlights = highlights; 15 | } 16 | 17 | public HttpArticle getArticle() { 18 | return article; 19 | } 20 | 21 | public void setArticle(HttpArticle article) { 22 | this.article = article; 23 | } 24 | 25 | public List getHighlights() { 26 | return highlights; 27 | } 28 | 29 | public void setHighlights(List highlights) { 30 | this.highlights = highlights; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HtmlAnalysisResult.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | public class HtmlAnalysisResult implements Serializable { 8 | 9 | private String url; 10 | private String title; 11 | private Integer httpStatus; 12 | private List links; 13 | private Map headers; 14 | private List metaValues; 15 | 16 | private String robotsTxt; 17 | private Boolean robotsAllowedAll; 18 | private Boolean robotsAllowedNone; 19 | private Boolean robotsAllowedHome; 20 | private List robotsSitemaps; 21 | private Long robotsCrawlDelay; 22 | 23 | public String getUrl() { 24 | return url; 25 | } 26 | 27 | public void setUrl(String url) { 28 | this.url = url; 29 | } 30 | 31 | public String getTitle() { 32 | return title; 33 | } 34 | 35 | public void setTitle(String title) { 36 | this.title = title; 37 | } 38 | 39 | public List getLinks() { 40 | return links; 41 | } 42 | 43 | public void setLinks(List links) { 44 | this.links = links; 45 | } 46 | 47 | public List getMetaValues() { 48 | return metaValues; 49 | } 50 | 51 | public void setMetaValues(List metaValues) { 52 | this.metaValues = metaValues; 53 | } 54 | 55 | public String getRobotsTxt() { 56 | return robotsTxt; 57 | } 58 | 59 | public void setRobotsTxt(String robotsTxt) { 60 | this.robotsTxt = robotsTxt; 61 | } 62 | 63 | public Boolean getRobotsAllowedAll() { 64 | return robotsAllowedAll; 65 | } 66 | 67 | public void setRobotsAllowedAll(Boolean robotsAllowedAll) { 68 | this.robotsAllowedAll = robotsAllowedAll; 69 | } 70 | 71 | public Boolean getRobotsAllowedNone() { 72 | return robotsAllowedNone; 73 | } 74 | 75 | public void setRobotsAllowedNone(Boolean robotsAllowedNone) { 76 | this.robotsAllowedNone = robotsAllowedNone; 77 | } 78 | 79 | public Boolean getRobotsAllowedHome() { 80 | return robotsAllowedHome; 81 | } 82 | 83 | public void setRobotsAllowedHome(Boolean robotsAllowedHome) { 84 | this.robotsAllowedHome = robotsAllowedHome; 85 | } 86 | 87 | public List getRobotsSitemaps() { 88 | return robotsSitemaps; 89 | } 90 | 91 | public void setRobotsSitemaps(List robotsSitemaps) { 92 | this.robotsSitemaps = robotsSitemaps; 93 | } 94 | 95 | public Long getRobotsCrawlDelay() { 96 | return robotsCrawlDelay; 97 | } 98 | 99 | public 
void setRobotsCrawlDelay(Long robotsCrawlDelay) { 100 | this.robotsCrawlDelay = robotsCrawlDelay; 101 | } 102 | 103 | public Integer getHttpStatus() { 104 | return httpStatus; 105 | } 106 | 107 | public void setHttpStatus(Integer httpStatus) { 108 | this.httpStatus = httpStatus; 109 | } 110 | 111 | public Map getHeaders() { 112 | return headers; 113 | } 114 | 115 | public void setHeaders(Map headers) { 116 | this.headers = headers; 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticle.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | 4 | import org.joda.time.DateTime; 5 | 6 | import java.io.Serializable; 7 | import java.util.List; 8 | 9 | public class HttpArticle implements Serializable { 10 | 11 | private String source; 12 | 13 | private String language; 14 | 15 | private String url; 16 | 17 | private String title; 18 | 19 | private String text; 20 | 21 | private String textSignature; 22 | 23 | private List appIds; 24 | 25 | private DateTime published; 26 | 27 | private DateTime discovered; 28 | 29 | private List categories; 30 | 31 | public String getSource() { 32 | return source; 33 | } 34 | 35 | public String getUrl() { 36 | return url; 37 | } 38 | 39 | public void setSource(String source) { 40 | this.source = source; 41 | } 42 | 43 | public DateTime getPublished() { 44 | return published; 45 | } 46 | 47 | public void setPublished(DateTime published) { 48 | this.published = published; 49 | } 50 | 51 | public DateTime getDiscovered() { 52 | return discovered; 53 | } 54 | 55 | public void setDiscovered(DateTime discovered) { 56 | this.discovered = discovered; 57 | } 58 | 59 | public void setUrl(String url) { 60 | this.url = url; 61 | } 62 | 63 | public String getTitle() { 64 | return title; 65 | } 66 | 67 | public void setTitle(String title) { 68 | this.title = title; 69 | } 70 | 71 | public String getText() { 72 | return text; 73 | } 74 | 75 | public void setText(String text) { 76 | this.text = text; 77 | } 78 | 79 | public List getAppIds() { 80 | return appIds; 81 | } 82 | 83 | public void setAppIds(List appIds) { 84 | this.appIds = appIds; 85 | } 86 | 87 | public List getCategories() { 88 | return categories; 89 | } 90 | 91 | public void setCategories(List categories) { 92 | this.categories = categories; 93 | } 94 | 95 | public String getLanguage() { 96 | return language; 97 | } 98 | 99 | public void setLanguage(String language) { 100 | this.language = language; 101 | } 102 | 103 | public String getTextSignature() { 104 | return textSignature; 105 | } 106 | 107 | public void setTextSignature(String textSignature) { 108 | this.textSignature = textSignature; 109 | } 110 | 111 | @Override 112 | public String toString() { 113 | return "HttpArticle{" + 114 | "source='" + source + '\'' + 115 | ", language='" + language + '\'' + 116 | ", url='" + url + '\'' + 117 | ", title='" + title + '\'' + 118 | ", text='" + text + '\'' + 119 | ", textSignature='" + textSignature + '\'' + 120 | ", appIds=" + appIds + 121 | ", published=" + published + 122 | ", discovered=" + discovered + 123 | ", categories=" + categories + 124 | '}'; 125 | } 126 | } -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticleParseResult.java: -------------------------------------------------------------------------------- 1 | package 
lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class HttpArticleParseResult implements Serializable { 8 | 9 | private HttpArticle article; 10 | 11 | private List<String> titleMatches; 12 | 13 | private List<String> textMatches; 14 | 15 | private List<String> publishedTexts; 16 | 17 | private List<String> publishedMatches; 18 | 19 | private String publishedPattern; 20 | 21 | public HttpArticleParseResult() { 22 | } 23 | 24 | public HttpArticleParseResult(HttpArticle article) { 25 | this.article = article; 26 | } 27 | 28 | public HttpArticle getArticle() { 29 | return article; 30 | } 31 | 32 | public void setArticle(HttpArticle article) { 33 | this.article = article; 34 | } 35 | 36 | public List<String> getTitleMatches() { 37 | return titleMatches != null ? titleMatches : Collections.emptyList(); 38 | } 39 | 40 | public void setTitleMatches(List<String> titleMatches) { 41 | this.titleMatches = titleMatches; 42 | } 43 | 44 | public List<String> getTextMatches() { 45 | return textMatches != null ? textMatches : Collections.emptyList(); 46 | } 47 | 48 | public void setTextMatches(List<String> textMatches) { 49 | this.textMatches = textMatches; 50 | } 51 | 52 | public List<String> getPublishedTexts() { 53 | return publishedTexts != null ? publishedTexts : Collections.emptyList(); 54 | } 55 | 56 | public void setPublishedTexts(List<String> publishedTexts) { 57 | this.publishedTexts = publishedTexts; 58 | } 59 | 60 | public List<String> getPublishedMatches() { 61 | return publishedMatches != null ? publishedMatches : Collections.emptyList(); 62 | } 63 | 64 | public void setPublishedMatches(List<String> publishedMatches) { 65 | this.publishedMatches = publishedMatches; 66 | } 67 | 68 | public String getPublishedPattern() { 69 | return publishedPattern; 70 | } 71 | 72 | public void setPublishedPattern(String publishedPattern) { 73 | this.publishedPattern = publishedPattern; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpSourceTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | 5 | public class HttpSourceTest implements Serializable { 6 | 7 | private String source; 8 | 9 | private String url; 10 | 11 | private Boolean urlAccepted; 12 | 13 | private String html; 14 | 15 | private String title; 16 | 17 | private String text; 18 | 19 | private String date; 20 | 21 | public String getSource() { 22 | return source; 23 | } 24 | 25 | public void setSource(String source) { 26 | this.source = source; 27 | } 28 | 29 | public String getUrl() { 30 | return url; 31 | } 32 | 33 | public void setUrl(String url) { 34 | this.url = url; 35 | } 36 | 37 | public Boolean getUrlAccepted() { 38 | return urlAccepted; 39 | } 40 | 41 | public void setUrlAccepted(Boolean urlAccepted) { 42 | this.urlAccepted = urlAccepted; 43 | } 44 | 45 | public String getHtml() { 46 | return html; 47 | } 48 | 49 | public void setHtml(String html) { 50 | this.html = html; 51 | } 52 | 53 | public String getTitle() { 54 | return title; 55 | } 56 | 57 | public void setTitle(String title) { 58 | this.title = title; 59 | } 60 | 61 | public String getText() { 62 | return text; 63 | } 64 | 65 | public void setText(String text) { 66 | this.text = text; 67 | } 68 | 69 | public String getDate() { 70 | return date; 71 | } 72 | 73 | public void setDate(String date) { 74 | this.date = date; 75 | } 76 | } 77 |
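The data-model classes above are plain serializable beans; HttpSourceTest in particular is the fixture format consumed by HttpSourceTester in the parser module further below. The following sketch of that pairing is illustrative and not part of the repository: HttpSource's setUrl/setName calls appear in the Elasticsearch tests in this dump, but the remaining setters and the overall flow are assumptions.

// Illustrative sketch only: run an HttpSourceTest fixture against a source
// configuration. An empty result map means every expected field matched.
package lt.tokenmill.crawling.data;

import java.util.Map;

import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

public class HttpSourceTestExample {

    public static void main(String[] args) {
        HttpSource source = new HttpSource();
        source.setUrl("www.example.com");
        source.setName("Example");

        HttpSourceTest fixture = new HttpSourceTest();
        fixture.setSource("www.example.com");
        fixture.setUrl("http://www.example.com/news/1.html");
        fixture.setUrlAccepted(true);
        fixture.setHtml("<html><head><title>Title</title></head><body>Body text</body></html>");
        fixture.setTitle("Title");
        fixture.setText("Body text");

        // Each entry names a field whose actual value differed from the fixture.
        Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, fixture);
        differences.forEach((field, diff) -> System.out.println(field + " -> " + diff));
    }
}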
-------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpUrl.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import org.joda.time.DateTime; 4 | 5 | import java.io.Serializable; 6 | 7 | public class HttpUrl implements Serializable { 8 | 9 | /** 10 | * Source url. 11 | */ 12 | private String source; 13 | 14 | private String url; 15 | 16 | /** 17 | * Publish date when it is known before parsing, e.g. from an RSS feed. 18 | */ 19 | private String published; 20 | 21 | /** 22 | * When this url was discovered. 23 | */ 24 | private DateTime discovered; 25 | 26 | public String getSource() { 27 | return source; 28 | } 29 | 30 | public void setSource(String source) { 31 | this.source = source; 32 | } 33 | 34 | public String getUrl() { 35 | return url; 36 | } 37 | 38 | public void setUrl(String url) { 39 | this.url = url; 40 | } 41 | 42 | public String getPublished() { 43 | return published; 44 | } 45 | 46 | public void setPublished(String published) { 47 | this.published = published; 48 | } 49 | 50 | public DateTime getDiscovered() { 51 | return discovered; 52 | } 53 | 54 | public void setDiscovered(DateTime discovered) { 55 | this.discovered = discovered; 56 | } 57 | } -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/NamedQuery.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | 5 | public class NamedQuery implements Serializable { 6 | 7 | private String name; 8 | 9 | private String stemmedCaseSensitive; 10 | private String stemmedCaseInSensitive; 11 | private String notStemmedCaseSensitive; 12 | private String notStemmedCaseInSensitive; 13 | private String advanced; 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public String getStemmedCaseSensitive() { 24 | return stemmedCaseSensitive; 25 | } 26 | 27 | public void setStemmedCaseSensitive(String stemmedCaseSensitive) { 28 | this.stemmedCaseSensitive = stemmedCaseSensitive; 29 | } 30 | 31 | public String getStemmedCaseInSensitive() { 32 | return stemmedCaseInSensitive; 33 | } 34 | 35 | public void setStemmedCaseInSensitive(String stemmedCaseInSensitive) { 36 | this.stemmedCaseInSensitive = stemmedCaseInSensitive; 37 | } 38 | 39 | public String getNotStemmedCaseSensitive() { 40 | return notStemmedCaseSensitive; 41 | } 42 | 43 | public void setNotStemmedCaseSensitive(String notStemmedCaseSensitive) { 44 | this.notStemmedCaseSensitive = notStemmedCaseSensitive; 45 | } 46 | 47 | public String getNotStemmedCaseInSensitive() { 48 | return notStemmedCaseInSensitive; 49 | } 50 | 51 | public void setNotStemmedCaseInSensitive(String notStemmedCaseInSensitive) { 52 | this.notStemmedCaseInSensitive = notStemmedCaseInSensitive; 53 | } 54 | 55 | public String getAdvanced() { 56 | return advanced; 57 | } 58 | 59 | public void setAdvanced(String advanced) { 60 | this.advanced = advanced; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/PageableList.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable;
4 | import java.util.List; 5 | 6 | public class PageableList<T> implements Serializable { 7 | 8 | private long totalCount; 9 | 10 | private List<T> items; 11 | 12 | public long getTotalCount() { 13 | return totalCount; 14 | } 15 | 16 | public void setTotalCount(long totalCount) { 17 | this.totalCount = totalCount; 18 | } 19 | 20 | public List<T> getItems() { 21 | return items; 22 | } 23 | 24 | public void setItems(List<T> items) { 25 | this.items = items; 26 | } 27 | 28 | public static <T> PageableList<T> create(List<T> items, long totalCount) { 29 | PageableList<T> pageableList = new PageableList<>(); 30 | pageableList.setItems(items); 31 | pageableList.setTotalCount(totalCount); 32 | return pageableList; 33 | } 34 | } -------------------------------------------------------------------------------- /data-model/src/test/java/lt/tokenmill/crawling/data/DataUtilsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.joda.time.DateTime; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class DataUtilsTest { 10 | 11 | @Test 12 | public void normalizerSplitter() { 13 | assertEquals(Lists.newArrayList("\\?.*$-->>", "a-->>b"), 14 | DataUtils.parseStringList("\\?.*$-->>\na-->>b\r\r\n\n")); 15 | } 16 | 17 | @Test 18 | public void dateFormatInUTC() { 19 | Long DATE_2017_01_04_12_26_00 = 1483532760805L; 20 | assertEquals("2017-01-04T12:26:00", DataUtils.formatInUTC(new DateTime(DATE_2017_01_04_12_26_00))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | elasticsearch: 5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest 6 | ports: ["9200:9200"] 7 | environment: 8 | discovery.type: single-node 9 | kibana: 10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0 11 | ports: ["5601:5601"] 12 | environment: 13 | SERVER_NAME: kibana 14 | ELASTICSEARCH_URL: http://elasticsearch:9200 15 | -------------------------------------------------------------------------------- /docker-compose.run.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | elasticsearch: 5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest 6 | ports: ["9200:9200"] 7 | environment: 8 | discovery.type: single-node 9 | kibana: 10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0 11 | ports: ["5601:5601"] 12 | environment: 13 | SERVER_NAME: kibana 14 | ELASTICSEARCH_URL: http://elasticsearch:9200 15 | administration-ui: 16 | image: registry.gitlab.com/tokenmill/crawling-framework/ui:latest 17 | ports: ["8081:8081"] 18 | crawler: 19 | image: registry.gitlab.com/tokenmill/crawling-framework/crawler:latest 20 | environment: 21 | DEFAULT_SOURCE_RELOAD_DELAY: 10 22 | -------------------------------------------------------------------------------- /elasticsearch/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | elasticsearch 13 | 14 | 15 | 16 | 17 | lt.tokenmill.crawling 18 | data-model 19 | 20 | 21 | org.elasticsearch 22 | elasticsearch 23 | 24 | 25 | org.elasticsearch.client 26 | transport 27 | 28 | 29 | org.elasticsearch.client 30
| elasticsearch-rest-client 31 | ${elasticsearch.version} 32 | 33 | 34 | org.elasticsearch.client 35 | elasticsearch-rest-high-level-client 36 | ${elasticsearch.version} 37 | 38 | 39 | org.apache.httpcomponents 40 | httpasyncclient 41 | 4.1.3 42 | 43 | 44 | org.apache.httpcomponents 45 | httpcore-nio 46 | 4.4.6 47 | 48 | 49 | org.apache.httpcomponents 50 | httpclient 51 | 4.5.4 52 | 53 | 54 | org.apache.httpcomponents 55 | httpcore 56 | 4.4.6 57 | 58 | 59 | org.elasticsearch.plugin 60 | transport-netty4-client 61 | ${elasticsearch.version} 62 | test 63 | 64 | 65 | com.google.guava 66 | guava 67 | 68 | 69 | org.apache.logging.log4j 70 | log4j-api 71 | 2.7 72 | provided 73 | 74 | 75 | org.apache.logging.log4j 76 | log4j-core 77 | 2.13.2 78 | provided 79 | 80 | 81 | org.slf4j 82 | slf4j-log4j12 83 | ${slf4j.version} 84 | provided 85 | 86 | 87 | junit 88 | junit 89 | 4.13.1 90 | test 91 | 92 | 93 | 94 | 95 | 96 | release 97 | 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-source-plugin 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-jar-plugin 107 | 108 | 109 | 110 | org.apache.maven.plugins 111 | maven-javadoc-plugin 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.elasticsearch.client.RequestOptions; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.net.URLEncoder; 8 | import java.util.UUID; 9 | 10 | public class BaseElasticOps { 11 | 12 | protected final Logger LOG = LoggerFactory.getLogger(this.getClass()); 13 | 14 | private final RequestOptions requestOptions; 15 | private ElasticConnection connection; 16 | private String index; 17 | private String type; 18 | 19 | protected BaseElasticOps(ElasticConnection connection, String index, String type) { 20 | this.connection = connection; 21 | this.index = index; 22 | this.type = type; 23 | requestOptions = RequestOptions.DEFAULT; 24 | } 25 | 26 | protected ElasticConnection getConnection() { 27 | return connection; 28 | } 29 | 30 | protected String getIndex() { 31 | return index; 32 | } 33 | 34 | protected String getType() { 35 | return type; 36 | } 37 | 38 | protected RequestOptions getRequestOptions() { return requestOptions; } 39 | 40 | public void close() { 41 | if (connection != null) { 42 | connection.close(); 43 | } 44 | } 45 | 46 | protected static String formatId(String url) { 47 | try { 48 | String urlId = URLEncoder.encode(url.toLowerCase(), "utf-8"); 49 | if (urlId.length() > 511) { 50 | urlId = urlId.substring(0, 511); 51 | } 52 | return urlId; 53 | } catch (Exception e) { 54 | e.printStackTrace(); 55 | } 56 | return UUID.randomUUID().toString(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/ElasticConstants.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | public class ElasticConstants { 4 | 5 | public static final String ES_HOSTNAME_PARAM = "es.hostname"; 6 | public static final String ES_REST_PORT = "es.rest.port"; 7 | public static final String ES_REST_SCHEME = "es.rest.scheme"; 8 | 9 | public static final String ES_URLS_INDEX_NAME_PARAM = "es.urls.index.name"; 10 | public static final String 
ES_URLS_DOC_TYPE_PARAM = "es.urls.doc.type"; 11 | 12 | public static final String ES_DOCS_INDEX_NAME_PARAM = "es.docs.index.name"; 13 | public static final String ES_DOCS_DOC_TYPE_PARAM = "es.docs.doc.type"; 14 | 15 | public static final String ES_HTTP_SOURCES_INDEX_NAME_PARAM = "es.httpsource.index.name"; 16 | public static final String ES_HTTP_SOURCES_DOC_TYPE_PARAM = "es.httpsource.doc.type"; 17 | 18 | public static final String ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM = "es.httpsourcetest.index.name"; 19 | public static final String ES_HTTP_SOURCES_TEST_TYPE_PARAM = "es.httpsourcetest.doc.type"; 20 | 21 | public static final String ES_NAMED_QUERIES_INDEX_PARAM = "es.namedqueries.index.name"; 22 | public static final String ES_NAMED_QUERIES_TYPE_PARAM = "es.namedqueries.doc.type"; 23 | 24 | } 25 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDataParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.joda.time.DateTime; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.text.ParseException; 9 | import java.text.SimpleDateFormat; 10 | import java.util.List; 11 | import java.util.TimeZone; 12 | 13 | class EsDataParser { 14 | 15 | private static final Logger LOG = LoggerFactory.getLogger(EsDataParser.class); 16 | 17 | 18 | private static final List ES_DATE_TIME_FORMATS = Lists.newArrayList( 19 | "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", 20 | "yyyy-MM-dd'T'HH:mm:ss'Z'" 21 | ); 22 | 23 | static DateTime nullOrDate(Object object) { 24 | if (object != null) { 25 | DateTime result = null; 26 | for (String format : ES_DATE_TIME_FORMATS) { 27 | SimpleDateFormat formatter = new SimpleDateFormat(format); 28 | formatter.setTimeZone(TimeZone.getTimeZone("UTC")); 29 | try { 30 | result = new DateTime(formatter.parse(object.toString())); 31 | break; 32 | } catch (ParseException ignored) { 33 | } 34 | } 35 | if (result == null) { 36 | LOG.error("Failed to parse date from '{}'", object); 37 | } 38 | return result; 39 | } 40 | return null; 41 | } 42 | 43 | static boolean falseOrBoolean(Object object) { 44 | return (object != null) && Boolean.parseBoolean(object.toString()); 45 | } 46 | } -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourcesCache.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.cache.CacheBuilder; 4 | import com.google.common.cache.CacheLoader; 5 | import com.google.common.cache.LoadingCache; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | 8 | import java.util.concurrent.ExecutionException; 9 | import java.util.concurrent.TimeUnit; 10 | 11 | public class EsHttpSourcesCache { 12 | 13 | 14 | private static LoadingCache INSTANCE; 15 | 16 | private static synchronized LoadingCache getInstance( 17 | final EsHttpSourceOperations operations) { 18 | if (INSTANCE == null) { 19 | INSTANCE = CacheBuilder.newBuilder() 20 | .maximumSize(1000) 21 | .expireAfterWrite(10, TimeUnit.MINUTES) 22 | .build(new CacheLoader() { 23 | public HttpSource load(String url) { 24 | return operations.get(url); 25 | } 26 | }); 27 | } 28 | return INSTANCE; 29 | } 30 | 31 | public static HttpSource get(EsHttpSourceOperations operations, String source) { 32 | try { 33 | 
return getInstance(operations).get(source); 34 | } catch (ExecutionException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | 39 | public static void invalidate() { 40 | if (INSTANCE != null) { 41 | INSTANCE.invalidateAll(); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/Utils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.base.Joiner; 4 | import com.google.common.base.Splitter; 5 | import org.joda.time.DateTime; 6 | 7 | import java.util.Collection; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Objects; 11 | import java.util.stream.Collectors; 12 | 13 | import static com.google.common.base.Strings.isNullOrEmpty; 14 | 15 | public class Utils { 16 | 17 | private static final Splitter LINE_SPLITTER = Splitter.on('\n'); 18 | private static final Joiner LINE_JOINER = Joiner.on('\n'); 19 | 20 | public static List linesToList(String text) { 21 | return LINE_SPLITTER.splitToList(text).stream() 22 | .map(String::trim) 23 | .filter(l -> !isNullOrEmpty(l)) 24 | .collect(Collectors.toList()); 25 | } 26 | 27 | public static String listToText(List lines) { 28 | return lines != null ? LINE_JOINER.join(lines) : ""; 29 | } 30 | 31 | public static Object formatFieldValue(Object value) { 32 | if (value == null) { 33 | return null; 34 | } 35 | if (value instanceof List) { 36 | List v = (List) value; 37 | if (!v.isEmpty() && (v.get(0) instanceof Map)) { 38 | return v; 39 | } 40 | return listToText((List) value); 41 | } else if (value instanceof DateTime) { 42 | return ((DateTime) value).toDate(); 43 | } else if (value instanceof Enum) { 44 | return Objects.toString(value, null); 45 | } else { 46 | return value; 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/model/DateHistogramValue.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es.model; 2 | 3 | public class DateHistogramValue { 4 | 5 | private Long value; 6 | 7 | private String date; 8 | 9 | public DateHistogramValue(String date, Long value) { 10 | this.value = value; 11 | this.date = date; 12 | } 13 | 14 | public Long getValue() { 15 | return value; 16 | } 17 | 18 | public void setValue(Long value) { 19 | this.value = value; 20 | } 21 | 22 | public String getDate() { 23 | return date; 24 | } 25 | 26 | public void setDate(String date) { 27 | this.date = date; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | }, 8 | "analysis": { 9 | "filter": { 10 | "english_stop": { 11 | "type": "stop", 12 | "stopwords": "_english_" 13 | }, 14 | "english_light_stemmer": { 15 | "type": "stemmer", 16 | "language": "light_english" 17 | }, 18 | "english_possessive_stemmer": { 19 | "type": "stemmer", 20 | "language": "possessive_english" 21 | } 22 | }, 23 | "analyzer": { 24 | "english_stem_cs": { 25 | "tokenizer": "standard", 26 | "filter": [ 27 | "english_possessive_stemmer", 28 | "english_stop", 29 | 
"english_light_stemmer" 30 | ] 31 | }, 32 | "english_stem_ci": { 33 | "tokenizer": "standard", 34 | "filter": [ 35 | "english_possessive_stemmer", 36 | "lowercase", 37 | "english_stop", 38 | "english_light_stemmer" 39 | ] 40 | }, 41 | "english_nostem_cs": { 42 | "tokenizer": "standard", 43 | "filter": [ 44 | "english_possessive_stemmer", 45 | "english_stop" 46 | ] 47 | }, 48 | "english_nostem_ci": { 49 | "tokenizer": "standard", 50 | "filter": [ 51 | "english_possessive_stemmer", 52 | "lowercase", 53 | "english_stop" 54 | ] 55 | } 56 | } 57 | } 58 | }, 59 | "mappings": { 60 | "doc": { 61 | "_source": { 62 | "enabled": true 63 | }, 64 | "properties": { 65 | "created": { 66 | "type": "date" 67 | }, 68 | "published": { 69 | "type": "date" 70 | }, 71 | "discovered": { 72 | "type": "date" 73 | }, 74 | "updated": { 75 | "type": "date" 76 | }, 77 | "url": { 78 | "type": "keyword" 79 | }, 80 | "source": { 81 | "type": "keyword" 82 | }, 83 | "language": { 84 | "type": "keyword" 85 | }, 86 | "status": { 87 | "type": "keyword" 88 | }, 89 | "app_ids": { 90 | "type": "keyword" 91 | }, 92 | "categories": { 93 | "type": "keyword" 94 | }, 95 | "title": { 96 | "type": "text", 97 | "index": true, 98 | "doc_values": false, 99 | "fielddata": true, 100 | "fields": { 101 | "stem_cs": { 102 | "type": "text", 103 | "index": true, 104 | "analyzer": "english_stem_cs" 105 | }, 106 | "stem_ci": { 107 | "type": "text", 108 | "index": true, 109 | "analyzer": "english_stem_ci" 110 | }, 111 | "nostem_cs": { 112 | "type": "text", 113 | "index": true, 114 | "analyzer": "english_nostem_cs" 115 | }, 116 | "nostem_ci": { 117 | "type": "text", 118 | "index": true, 119 | "analyzer": "english_nostem_ci" 120 | } 121 | } 122 | }, 123 | "text": { 124 | "type": "text", 125 | "doc_values": false, 126 | "fielddata": true, 127 | "fields": { 128 | "stem_cs": { 129 | "type": "text", 130 | "index": true, 131 | "analyzer": "english_stem_cs" 132 | }, 133 | "stem_ci": { 134 | "type": "text", 135 | "index": true, 136 | "analyzer": "english_stem_ci" 137 | }, 138 | "nostem_cs": { 139 | "type": "text", 140 | "index": true, 141 | "analyzer": "english_nostem_cs" 142 | }, 143 | "nostem_ci": { 144 | "type": "text", 145 | "index": true, 146 | "analyzer": "english_nostem_ci" 147 | } 148 | } 149 | }, 150 | "text_signature": { 151 | "type": "keyword" 152 | }, 153 | "duplicate_of": { 154 | "type": "keyword" 155 | } 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/http_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "http_source": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "created": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "updated": { 20 | "type": "date", 21 | "format": "date_optional_time" 22 | }, 23 | "url": { 24 | "type": "keyword", 25 | "copy_to": "search_field" 26 | }, 27 | "name": { 28 | "type": "keyword", 29 | "copy_to": "search_field" 30 | }, 31 | "timezone": { 32 | "type": "keyword" 33 | }, 34 | "language": { 35 | "type": "keyword" 36 | }, 37 | "url_crawl_delay_secs": { 38 | "type": "integer" 39 | }, 40 | "feed_crawl_delay_secs": { 41 | "type": "integer" 42 | }, 43 | "sitemap_crawl_delay_secs": { 44 | "type": "integer" 45 | }, 46 | "enabled": { 47 | "type": "boolean" 
48 | }, 49 | "discovery_enabled": { 50 | "type": "boolean" 51 | }, 52 | "urls": { 53 | "type": "keyword", 54 | "copy_to": "search_field" 55 | }, 56 | "sitemaps": { 57 | "type": "keyword", 58 | "copy_to": "search_field" 59 | }, 60 | "feeds": { 61 | "type": "keyword", 62 | "copy_to": "search_field" 63 | }, 64 | "countries": { 65 | "type": "keyword" 66 | }, 67 | "categories": { 68 | "type": "keyword" 69 | }, 70 | "app_ids": { 71 | "type": "keyword" 72 | }, 73 | "url_filters": { 74 | "type": "keyword" 75 | }, 76 | "url_normalizers": { 77 | "type": "keyword" 78 | }, 79 | "title_selectors": { 80 | "type": "keyword" 81 | }, 82 | "date_selectors": { 83 | "type": "keyword" 84 | }, 85 | "text_selectors": { 86 | "type": "keyword" 87 | }, 88 | "text_normalizers": { 89 | "type": "keyword" 90 | }, 91 | "date_regexps": { 92 | "type": "keyword" 93 | }, 94 | "date_formats": { 95 | "type": "keyword" 96 | }, 97 | "search_field": { 98 | "type": "text" 99 | } 100 | } 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/http_source_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "http_source_test": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "updated": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "source_url": { 20 | "type": "keyword", 21 | "copy_to": "search_field" 22 | }, 23 | "url": { 24 | "type": "keyword", 25 | "copy_to": "search_field" 26 | }, 27 | "url_accepted": { 28 | "type": "boolean", 29 | "doc_values": false 30 | }, 31 | "html": { 32 | "type": "keyword", 33 | "index": false, 34 | "doc_values": false 35 | }, 36 | "title": { 37 | "type": "keyword", 38 | "index": false, 39 | "doc_values": false 40 | }, 41 | "text": { 42 | "type": "keyword", 43 | "index": false, 44 | "doc_values": false 45 | }, 46 | "date": { 47 | "type": "keyword", 48 | "index": false, 49 | "doc_values": false 50 | }, 51 | "search_field": { 52 | "type": "text" 53 | } 54 | } 55 | } 56 | } 57 | } -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "named_query": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "updated": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "name": { 20 | "type": "keyword" 21 | }, 22 | "name_suggest": { 23 | "type": "completion" 24 | }, 25 | "stemmed_case_sensitive": { 26 | "type": "keyword" 27 | }, 28 | "stemmed_case_insensitive": { 29 | "type": "keyword" 30 | }, 31 | "not_stemmed_case_sensitive": { 32 | "type": "keyword" 33 | }, 34 | "not_stemmed_case_insensitive": { 35 | "type": "keyword" 36 | }, 37 | "advanced": { 38 | "type": "keyword" 39 | } 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/url.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 
6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "url": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "created": { 16 | "type": "date" 17 | }, 18 | "updated": { 19 | "type": "date" 20 | }, 21 | "published": { 22 | "type": "date" 23 | }, 24 | "url": { 25 | "type": "keyword" 26 | }, 27 | "source": { 28 | "type": "keyword" 29 | }, 30 | "status": { 31 | "type": "keyword" 32 | } 33 | } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticConnectionTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.apache.http.HttpHost; 4 | import org.elasticsearch.action.DocWriteRequest; 5 | import org.elasticsearch.action.bulk.BulkItemResponse; 6 | import org.elasticsearch.action.bulk.BulkProcessor; 7 | import org.elasticsearch.action.bulk.BulkRequest; 8 | import org.elasticsearch.action.bulk.BulkResponse; 9 | import org.elasticsearch.action.index.IndexRequest; 10 | import org.elasticsearch.action.update.UpdateRequest; 11 | import org.elasticsearch.client.RestClient; 12 | import org.elasticsearch.client.RestClientBuilder; 13 | import org.elasticsearch.client.RestHighLevelClient; 14 | import org.elasticsearch.common.unit.TimeValue; 15 | import org.junit.Test; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.io.UnsupportedEncodingException; 20 | import java.net.URLDecoder; 21 | 22 | import static org.junit.Assert.assertNotNull; 23 | 24 | public class ElasticConnectionTest { 25 | private static final Logger LOG = LoggerFactory.getLogger(ElasticConnectionTest.class); 26 | @Test 27 | public void testConnectionBuilder() { 28 | ElasticConnection connection = ElasticConnection.builder().build(); 29 | assertNotNull(connection.getRestHighLevelClient()); 30 | } 31 | 32 | @Test 33 | public void testBuilder() { 34 | BulkProcessor.Listener listener = new BulkProcessor.Listener() { 35 | 36 | @Override 37 | public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { 38 | for (BulkItemResponse item : response.getItems()) { 39 | if (item.isFailed()) { 40 | LOG.error("Bulk item failure: '{}' for request '{}'", 41 | item.getFailure(), request.requests().get(item.getItemId())); 42 | } 43 | } 44 | } 45 | 46 | @Override 47 | public void afterBulk(long executionId, BulkRequest request, Throwable response) { 48 | LOG.error("Bulk failed:" + response); 49 | } 50 | 51 | @Override 52 | public void beforeBulk(long executionId, BulkRequest request) { 53 | for (DocWriteRequest r :request.requests()) { 54 | try { 55 | if (r instanceof IndexRequest) { 56 | IndexRequest indexRequest = (IndexRequest) r; 57 | indexRequest.id(URLDecoder.decode(indexRequest.id(), "utf-8")); 58 | 59 | } else if (r instanceof UpdateRequest) { 60 | UpdateRequest updateRequest = (UpdateRequest) r; 61 | updateRequest.id(URLDecoder.decode(updateRequest.id(), "utf-8")); 62 | } 63 | } catch (UnsupportedEncodingException e) { 64 | e.printStackTrace(); 65 | } 66 | } 67 | } 68 | }; 69 | ElasticConnection connection = ElasticConnection.builder() 70 | .hostname("0.0.0.0") 71 | .restPort(443) 72 | .restScheme("https") 73 | .bulkActions(1) 74 | .flushIntervalString("1s") 75 | .listener(listener) 76 | .build(); 77 | assertNotNull(connection); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- 
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticsearchTestServer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.elasticsearch.client.Client; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.node.Node; 7 | import org.elasticsearch.node.NodeValidationException; 8 | import org.elasticsearch.plugins.Plugin; 9 | import org.elasticsearch.transport.Netty4Plugin; 10 | 11 | import java.io.File; 12 | import java.io.IOException; 13 | import java.nio.file.FileVisitOption; 14 | import java.nio.file.Files; 15 | import java.nio.file.Path; 16 | import java.nio.file.Paths; 17 | import java.util.Arrays; 18 | import java.util.Collection; 19 | import java.util.Comparator; 20 | 21 | public class ElasticsearchTestServer { 22 | 23 | private static class MyNode extends Node { 24 | MyNode(Settings preparedSettings, Collection> classpathPlugins) { 25 | super(new Environment(preparedSettings, null), classpathPlugins, false); 26 | } 27 | } 28 | 29 | private final Node node; 30 | private Client client; 31 | 32 | private ElasticsearchTestServer(Builder builder) { 33 | if (builder.cleanDataDir) { 34 | try { 35 | Path rootPath = Paths.get(builder.dataDirectory); 36 | if (Files.exists(rootPath)) { 37 | Files.walk(rootPath, FileVisitOption.FOLLOW_LINKS) 38 | .sorted(Comparator.reverseOrder()) 39 | .map(Path::toFile) 40 | .forEach(File::delete); 41 | } 42 | } catch (IOException e) { 43 | e.printStackTrace(); 44 | } 45 | } 46 | Settings settings = Settings.builder() 47 | .put("client.transport.ignore_cluster_name", true) 48 | .put("transport.type", "netty4") 49 | .put("http.type", "netty4") 50 | .put("http.enabled", "true") 51 | .put("http.port", builder.httpPort) 52 | .put("path.home", builder.dataDirectory) 53 | .put("transport.tcp.port", builder.transportPort) 54 | .build(); 55 | this.node = new MyNode(settings, Arrays.asList(Netty4Plugin.class)); 56 | } 57 | 58 | public void start() { 59 | try { 60 | this.node.start(); 61 | this.client = this.node.client(); 62 | } catch (NodeValidationException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | public void stop() { 68 | try { 69 | this.client.close(); 70 | this.node.close(); 71 | } catch (IOException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | 76 | public static Builder builder() { 77 | return new Builder(); 78 | } 79 | 80 | 81 | public static class Builder { 82 | 83 | private boolean cleanDataDir = true; 84 | private String dataDirectory = "target/elasticsearch-data"; 85 | private int httpPort = 9200; 86 | private int transportPort = 9305; 87 | 88 | public Builder httpPort(int httpPort) { 89 | this.httpPort = httpPort; 90 | return this; 91 | } 92 | 93 | public Builder transportPort(int transportPort) { 94 | this.transportPort = transportPort; 95 | return this; 96 | } 97 | 98 | public ElasticsearchTestServer build() { 99 | return new ElasticsearchTestServer(this); 100 | } 101 | 102 | 103 | public Builder dataDirectory(String dataDirectory) { 104 | this.dataDirectory = dataDirectory; 105 | return this; 106 | } 107 | 108 | public Builder cleanDataDir(boolean cleanDataDir) { 109 | this.cleanDataDir = cleanDataDir; 110 | return this; 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsDocumentOperationsTest.java: 
-------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import org.joda.time.DateTime; 6 | import org.junit.Ignore; 7 | import org.junit.Test; 8 | 9 | import java.util.Arrays; 10 | import java.util.Map; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | import static org.junit.Assert.assertNull; 14 | 15 | public class EsDocumentOperationsTest { 16 | 17 | @Test 18 | @Ignore 19 | public void test() throws InterruptedException { 20 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 21 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "demo-docs", "doc"); 22 | HttpArticle article = new HttpArticle(); 23 | article.setUrl("http://www.bbc.com/news/science-environment-43727547"); 24 | article.setTitle("title"); 25 | article.setText("text"); 26 | article.setPublished(DateTime.now()); 27 | 28 | esDocumentOperations.store(article); 29 | 30 | Thread.sleep(6000); 31 | 32 | HttpArticle httpArticle = esDocumentOperations.get(article.getUrl()); 33 | assertEquals(article.getUrl(), httpArticle.getUrl()); 34 | assertEquals(article.getText(), httpArticle.getText()); 35 | 36 | esDocumentOperations.update(article, ImmutableMap.of("TESTKEY", Arrays.asList(ImmutableMap.of("k1", "v1")))); 37 | Thread.sleep(6000); 38 | Map articleMap = esDocumentOperations.getAsMap(article.getUrl()); 39 | assertEquals(article.getText(), articleMap.get("text")); 40 | assertEquals("TESTVAL", articleMap.get("TESTKEY")); 41 | } 42 | 43 | @Test 44 | @Ignore 45 | public void testDuplicateFinder() throws InterruptedException { 46 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 47 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "cf-docs", "doc"); 48 | HttpArticle article = new HttpArticle(); 49 | article.setUrl("url1"); 50 | article.setSource("source"); 51 | article.setTitle("title"); 52 | article.setText("text"); 53 | article.setTextSignature("text_signature"); 54 | article.setPublished(DateTime.now()); 55 | esDocumentOperations.store(article); 56 | Thread.sleep(6000); 57 | HttpArticle duplicate = esDocumentOperations.findDuplicate(article); 58 | assertNull(duplicate); 59 | article.setUrl("url2"); 60 | esDocumentOperations.store(article); 61 | Thread.sleep(6000); 62 | assertEquals("url1", esDocumentOperations.getAsMap("url2").get("duplicate_of")); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpSourceOperationsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import lt.tokenmill.crawling.data.HttpSource; 4 | import lt.tokenmill.crawling.data.PageableList; 5 | import org.junit.Ignore; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.*; 9 | 10 | public class EsHttpSourceOperationsTest { 11 | 12 | @Test 13 | @Ignore 14 | public void test() { 15 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 16 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "demo-http_sources", "http_source"); 17 | PageableList data = esHttpSourceOperations.filter(null); 18 | for (HttpSource source : data.getItems()) { 19 | 
System.out.println(">>" + source); 20 | } 21 | } 22 | 23 | @Ignore 24 | @Test 25 | public void testRefresh() { 26 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 27 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "cf-http_sources", "http_source"); 28 | HttpSource source = new HttpSource(); 29 | source.setName("test"); 30 | source.setUrl("url"); 31 | esHttpSourceOperations.save(source); 32 | String currentName = esHttpSourceOperations.get("url").getName(); 33 | assertEquals("test", currentName); 34 | source.setName("new name"); 35 | esHttpSourceOperations.save(source); 36 | String name = esHttpSourceOperations.get("url").getName(); 37 | assertNotEquals("test", name); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpUrlOperationsTestInt.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import lt.tokenmill.crawling.data.HttpUrl; 4 | import org.junit.Test; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | import java.util.List; 10 | 11 | import static junit.framework.TestCase.assertTrue; 12 | 13 | public class EsHttpUrlOperationsTestInt { 14 | 15 | private static final Logger LOG = LoggerFactory.getLogger(EsHttpUrlOperationsTestInt.class); 16 | 17 | private static final String ES_TEST_HOST = "elasticsearch"; 18 | private static final int ES_HTTP_TEST_PORT = 9200; 19 | private static final String ES_REST_TEST_SCHEME = "http"; 20 | private static final String INDEX_ALIAS = "urls"; 21 | private static final String DOC_TYPE = "url"; 22 | 23 | 24 | @Test 25 | public void testEsHttpSourceOperations000() throws IOException, InterruptedException { 26 | ElasticConnection connection = ElasticConnection.getConnection(ES_TEST_HOST, ES_HTTP_TEST_PORT, ES_REST_TEST_SCHEME); 27 | EsHttpUrlOperations esHttpUrlOperations = EsHttpUrlOperations.getInstance(connection, INDEX_ALIAS, DOC_TYPE); 28 | 29 | String url = "http://www.bbc.com/news/science-environment-43727547"; 30 | String source = "www.bbc.com"; 31 | esHttpUrlOperations.upsertUrlStatus(url, null, source, true, "a"); 32 | Thread.sleep(6000); 33 | esHttpUrlOperations.upsertUrlStatus(url, null, source, false, "b"); 34 | Thread.sleep(6000); 35 | List urls = esHttpUrlOperations.findUrlsByStatusAndSource("b", source, 10); 36 | assertTrue(urls.size() > 0); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/TestUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.nio.charset.StandardCharsets; 6 | import java.nio.file.Files; 7 | import java.nio.file.Paths; 8 | 9 | public class TestUtils { 10 | 11 | public static byte[] readResourceAsBytes(String filename) throws URISyntaxException, IOException { 12 | return Files.readAllBytes(Paths.get(TestUtils.class.getClassLoader().getResource(filename).toURI())); 13 | } 14 | 15 | public static String readResourceAsString(String filename) throws URISyntaxException, IOException { 16 | return new String(readResourceAsBytes(filename), StandardCharsets.UTF_8); 17 | } 18 | 19 | } 20 | 
-------------------------------------------------------------------------------- /elasticsearch/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Target=System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{5}:%L - %m%n 7 | -------------------------------------------------------------------------------- /elasticsearch/src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | name=PropertiesConfig 2 | property.filename = logs 3 | appenders = console 4 | appender.console.type = Console 5 | appender.console.name = STDOUT 6 | appender.console.layout.type = PatternLayout 7 | appender.console.layout.pattern = [%-5level] %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %c{5} - %msg%n 8 | 9 | rootLogger.level = WARN 10 | rootLogger.appenderRefs = stdout 11 | rootLogger.appenderRef.stdout.ref = STDOUT 12 | 13 | appender.org.elasticsearch = debug -------------------------------------------------------------------------------- /page-analyzer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | page-analyzer 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | data-model 18 | ${project.version} 19 | 20 | 21 | org.jsoup 22 | jsoup 23 | 24 | 25 | com.google.guava 26 | guava 27 | 28 | 29 | com.mashape.unirest 30 | unirest-java 31 | 1.4.9 32 | 33 | 34 | com.github.crawler-commons 35 | crawler-commons 36 | 0.7 37 | 38 | 39 | org.slf4j 40 | slf4j-log4j12 41 | ${slf4j.version} 42 | provided 43 | 44 | 45 | junit 46 | junit 47 | 4.13.1 48 | test 49 | 50 | 51 | 52 | 53 | 54 | release 55 | 56 | 57 | 58 | org.apache.maven.plugins 59 | maven-source-plugin 60 | 61 | 62 | 63 | org.apache.maven.plugins 64 | maven-jar-plugin 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-javadoc-plugin 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /page-analyzer/src/main/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.pageanalyzer; 2 | 3 | import com.google.common.base.Joiner; 4 | import com.google.common.collect.Maps; 5 | import com.mashape.unirest.http.HttpResponse; 6 | import com.mashape.unirest.http.Unirest; 7 | import com.mashape.unirest.http.exceptions.UnirestException; 8 | import crawlercommons.robots.BaseRobotRules; 9 | import crawlercommons.robots.SimpleRobotRulesParser; 10 | import lt.tokenmill.crawling.data.HtmlAnalysisResult; 11 | import org.jsoup.Jsoup; 12 | import org.jsoup.nodes.Document; 13 | import org.jsoup.nodes.Element; 14 | 15 | import java.net.URL; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.stream.Collectors; 19 | 20 | public class PageAnalyzer { 21 | 22 | private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"; 23 | 24 | public static final String CONFIG_USER_AGENT = "UserAgent"; 25 | public static final String CONFIG_ANALYZE_ROBOTS_TXT = "RobotsTxt"; 26 | 27 | public static HtmlAnalysisResult analyze(Map 
config, String url) { 28 | try { 29 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT); 30 | HttpResponse response = Unirest.get(url) 31 | .header("User-Agent", userAgent) 32 | .asString(); 33 | return analyze(config, url, response.getBody(), response.getStatus(), response.getHeaders()); 34 | } catch (UnirestException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | 39 | public static HtmlAnalysisResult analyze(Map config, String url, String html) { 40 | return analyze(config, url, html, null, Maps.newHashMap()); 41 | } 42 | 43 | public static HtmlAnalysisResult analyze(Map config, String url, String html, Integer status, Map> headers) { 44 | try { 45 | HtmlAnalysisResult result = new HtmlAnalysisResult(); 46 | result.setUrl(url); 47 | result.setHttpStatus(status); 48 | result.setHeaders(headers.entrySet() 49 | .stream() 50 | .collect(Collectors.toMap(Map.Entry::getKey, e -> Joiner.on("\n").join(e.getValue())))); 51 | 52 | Document document = Jsoup.parse(html, url); 53 | result.setTitle(document.title()); 54 | 55 | List meta = document.select("meta").stream().map(Element::toString).collect(Collectors.toList()); 56 | result.setMetaValues(meta); 57 | 58 | List links = document.select("a").stream().map(e -> e.attr("abs:href")).collect(Collectors.toList()); 59 | result.setLinks(links); 60 | 61 | if (Boolean.parseBoolean(config.get(CONFIG_ANALYZE_ROBOTS_TXT))) { 62 | String robotsUrl = robotsTxtUrl(url); 63 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT); 64 | HttpResponse response = Unirest.get(robotsUrl) 65 | .header("User-Agent", userAgent) 66 | .asString(); 67 | String robotsTxt = response.getBody(); 68 | parseRobotsTxt(userAgent, robotsUrl, robotsTxt, result); 69 | } 70 | return result; 71 | } catch (Exception e) { 72 | throw new RuntimeException(e); 73 | } 74 | } 75 | 76 | public static void parseRobotsTxt(String userAgent, String robotsUrl, String robotsTxt, HtmlAnalysisResult result) { 77 | result.setRobotsTxt(robotsTxt); 78 | SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); 79 | BaseRobotRules robotRules = robotsParser.parseContent(robotsUrl, robotsTxt.getBytes(), null, userAgent); 80 | result.setRobotsAllowedAll(robotRules.isAllowAll()); 81 | result.setRobotsAllowedNone(robotRules.isAllowNone()); 82 | result.setRobotsAllowedHome(robotRules.isAllowed("/")); 83 | result.setRobotsSitemaps(robotRules.getSitemaps()); 84 | result.setRobotsCrawlDelay(robotRules.getCrawlDelay()); 85 | } 86 | 87 | private static String robotsTxtUrl(String url) { 88 | try { 89 | URL urlObject = new URL(url); 90 | String portPart = urlObject.getPort() > 0 ? 
":" + urlObject.getPort() : ""; 91 | return String.format("%s://%s%s/robots.txt", urlObject.getProtocol(), 92 | urlObject.getHost(), portPart); 93 | } catch (Exception e) { 94 | throw new RuntimeException(e); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /page-analyzer/src/test/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.pageanalyzer; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.collect.Lists; 5 | import com.google.common.collect.Maps; 6 | import com.google.common.io.Resources; 7 | import lt.tokenmill.crawling.data.HtmlAnalysisResult; 8 | import org.junit.Ignore; 9 | import org.junit.Test; 10 | 11 | import java.net.URL; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.assertFalse; 18 | import static org.junit.Assert.assertTrue; 19 | 20 | public class PageAnalyzerTest { 21 | 22 | @Test 23 | public void headersAndStatus() { 24 | Map> headers = Maps.newHashMap(); 25 | headers.put("Etag", Lists.newArrayList("c1dc8d7be85325149", "ed5fc4d62b84752")); 26 | headers.put("Date", Lists.newArrayList("Wed, 11 Jan 2017 13:00:18 GMT")); 27 | HashMap config = Maps.newHashMap(); 28 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://example.org", "", 200, headers); 29 | 30 | assertEquals(new Integer(200), result.getHttpStatus()); 31 | assertEquals(2, result.getHeaders().size()); 32 | assertEquals("c1dc8d7be85325149\ned5fc4d62b84752", result.getHeaders().get("Etag")); 33 | assertEquals("Wed, 11 Jan 2017 13:00:18 GMT", result.getHeaders().get("Date")); 34 | } 35 | 36 | 37 | @Test 38 | public void htmlParsing() { 39 | HashMap config = Maps.newHashMap(); 40 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://bloomberg.com/", loadHtml("bloomberg.com"), 200, Maps.newHashMap()); 41 | assertEquals("Bloomberg.com", result.getTitle()); 42 | assertEquals(33, result.getMetaValues().size()); 43 | assertTrue(result.getMetaValues().contains("")); 44 | assertEquals(361, result.getLinks().size()); 45 | assertTrue(result.getLinks().contains("https://www.bloomberg.com/news/articles/2017-01-10/netanyahu-s-grip-on-power-under-threat-as-gift-scandal-escalates")); 46 | } 47 | 48 | @Test 49 | @Ignore 50 | public void fetchAndParse() { 51 | HashMap config = Maps.newHashMap(); 52 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true"); 53 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://www.tokenmill.lt/"); 54 | assertEquals("TokenMill - Natural Language Processing", result.getTitle()); 55 | assertEquals(10, result.getMetaValues().size()); 56 | assertEquals(42, result.getLinks().size()); 57 | assertTrue(result.getLinks().contains("http://www.tokenmill.lt/#case-monitoring")); 58 | assertTrue(result.getRobotsAllowedAll()); 59 | assertFalse(result.getRobotsAllowedNone()); 60 | assertTrue(result.getRobotsAllowedHome()); 61 | assertEquals(Lists.newArrayList(), result.getRobotsSitemaps()); 62 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay()); 63 | 64 | } 65 | 66 | @Test 67 | @Ignore 68 | public void fetchAndParseRobotsTxt() { 69 | HashMap config = Maps.newHashMap(); 70 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true"); 71 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://www.google.com"); 
72 | assertFalse(result.getRobotsAllowedAll()); 73 | assertFalse(result.getRobotsAllowedNone()); 74 | assertTrue(result.getRobotsAllowedHome()); 75 | assertTrue(result.getRobotsSitemaps().contains("http://www.gstatic.com/culturalinstitute/sitemaps/www_google_com_culturalinstitute/sitemap-index.xml")); 76 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay()); 77 | 78 | } 79 | 80 | private static String loadHtml(String name) { 81 | try { 82 | URL htmlResource = Resources.getResource(name + ".html"); 83 | return Resources.toString(htmlResource, Charsets.UTF_8); 84 | } catch (Exception e) { 85 | throw new RuntimeException(e); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /parser/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | parser 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | data-model 18 | ${project.version} 19 | 20 | 21 | org.jsoup 22 | jsoup 23 | 24 | 25 | com.github.jsonld-java 26 | jsonld-java 27 | 28 | 29 | com.google.guava 30 | guava 31 | 32 | 33 | org.apache.commons 34 | commons-lang3 35 | 3.5 36 | 37 | 38 | org.clojure 39 | clojure 40 | 1.7.0 41 | 42 | 43 | lt.tokenmill 44 | timewords 45 | ${timewords.version} 46 | 47 | 48 | org.slf4j 49 | slf4j-log4j12 50 | ${slf4j.version} 51 | provided 52 | 53 | 54 | junit 55 | junit 56 | 4.13.1 57 | test 58 | 59 | 60 | 61 | 62 | 63 | release 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-source-plugin 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-jar-plugin 74 | 75 | 76 | 77 | org.apache.maven.plugins 78 | maven-javadoc-plugin 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/PageAnalyzer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | public class PageAnalyzer { 4 | 5 | 6 | 7 | } 8 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/TitleParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | 4 | import com.google.common.base.Strings; 5 | import com.google.common.collect.Lists; 6 | import com.google.common.collect.Maps; 7 | import lt.tokenmill.crawling.parser.data.MatchedString; 8 | import org.jsoup.nodes.Document; 9 | 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.stream.Collectors; 13 | 14 | public class TitleParser { 15 | 16 | private static final List TITLE_META_KEYS = Lists.newArrayList("og:title"); 17 | 18 | public static List extractFromMeta(Document document) { 19 | String itempropValue = document.select("[itemprop*=headline]").text(); 20 | if (itempropValue != null && !itempropValue.trim().isEmpty()) { 21 | return Lists.newArrayList(new MatchedString(itempropValue, "[itemprop*=headline]")); 22 | } 23 | Map metaValues = Maps.newHashMap(); 24 | document.select("meta").forEach(m -> { 25 | String name = m.attr("name"); 26 | String property = m.attr("property"); 27 | String content = m.attr("content"); 28 | if (!Strings.isNullOrEmpty(name)) { 29 | metaValues.put(name.toLowerCase(), content); 30 | } else if (!Strings.isNullOrEmpty(property)) { 31 | metaValues.put(property.toLowerCase(), content); 32 | } 
33 | }); 34 | return TITLE_META_KEYS.stream() 35 | .filter(k -> metaValues.get(k) != null) 36 | .map(k -> new MatchedString(metaValues.get(k), "META:" + k)) 37 | .collect(Collectors.toList()); 38 | } 39 | } -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedDate.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.data; 2 | 3 | import org.joda.time.DateTime; 4 | 5 | import java.util.Objects; 6 | 7 | public class MatchedDate { 8 | 9 | private String value; 10 | 11 | private String match; 12 | 13 | private String pattern; 14 | 15 | private DateTime date; 16 | 17 | public MatchedDate(String value, String match) { 18 | this.value = value; 19 | this.match = match; 20 | } 21 | 22 | public String getValue() { 23 | return value; 24 | } 25 | 26 | public void setValue(String value) { 27 | this.value = value; 28 | } 29 | 30 | public String getMatch() { 31 | return match; 32 | } 33 | 34 | public void setMatch(String match) { 35 | this.match = match; 36 | } 37 | 38 | public DateTime getDate() { 39 | return date; 40 | } 41 | 42 | public void setDate(DateTime date) { 43 | this.date = date; 44 | } 45 | 46 | public String getPattern() { 47 | return pattern; 48 | } 49 | 50 | public void setPattern(String pattern) { 51 | this.pattern = pattern; 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return "MatchedDate{" + 57 | "value='" + value + '\'' + 58 | ", match='" + match + '\'' + 59 | ", pattern='" + pattern + '\'' + 60 | ", date=" + date + 61 | '}'; 62 | } 63 | 64 | @Override 65 | public boolean equals(Object o) { 66 | if (this == o) return true; 67 | if (o == null || getClass() != o.getClass()) return false; 68 | MatchedDate that = (MatchedDate) o; 69 | return Objects.equals(value, that.value); 70 | } 71 | 72 | @Override 73 | public int hashCode() { 74 | return Objects.hash(value); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedString.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.data; 2 | 3 | import java.util.Objects; 4 | 5 | public class MatchedString { 6 | 7 | private String value; 8 | 9 | private String match; 10 | 11 | public MatchedString(String value, String match) { 12 | this.value = value; 13 | this.match = match; 14 | } 15 | 16 | public String getValue() { 17 | return value; 18 | } 19 | 20 | public void setValue(String value) { 21 | this.value = value; 22 | } 23 | 24 | public String getMatch() { 25 | return match; 26 | } 27 | 28 | public void setMatch(String match) { 29 | this.match = match; 30 | } 31 | 32 | @Override 33 | public boolean equals(Object o) { 34 | if (this == o) return true; 35 | if (o == null || getClass() != o.getClass()) return false; 36 | MatchedString that = (MatchedString) o; 37 | return Objects.equals(value, that.value); 38 | } 39 | 40 | @Override 41 | public int hashCode() { 42 | return Objects.hash(value); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/urls/UrlExtractor.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import org.jsoup.nodes.Document; 4 | import org.jsoup.select.Elements; 5 | 6 | import 
java.net.URI; 7 | import java.net.URISyntaxException; 8 | import java.util.HashSet; 9 | import java.util.Set; 10 | import java.util.stream.Collectors; 11 | 12 | public class UrlExtractor { 13 | 14 | private static boolean isAbsolute(String url) { 15 | try { 16 | URI uri = new URI(url); 17 | return uri.isAbsolute(); 18 | } catch (URISyntaxException e) { 19 | e.printStackTrace(); 20 | return false; 21 | } 22 | } 23 | 24 | private static Set<String> extract(Document document) { 25 | Set<String> canonicalUrls = new HashSet<>(); 26 | if (document == null) { 27 | return canonicalUrls; 28 | } 29 | 30 | Elements elements = document.select("meta[property=og:url]"); 31 | elements.forEach(element -> { 32 | String attr = element.attr("content"); 33 | if (attr != null) { 34 | canonicalUrls.add(attr); 35 | } 36 | }); 37 | 38 | elements = document.select("link[rel=canonical]"); 39 | elements.forEach(element -> { 40 | String attr = element.attr("href"); 41 | if (attr != null) { 42 | canonicalUrls.add(attr); 43 | } 44 | }); 45 | 46 | return canonicalUrls.stream() 47 | .filter(UrlExtractor::isAbsolute) 48 | .collect(Collectors.toSet()); 49 | } 50 | 51 | public static String extract(String url, Document document) { 52 | // extract(Document) never returns null; fall back to the passed-in URL when no canonical URL is found 53 | Set<String> canonicalUrls = extract(document); 54 | return canonicalUrls.stream().findFirst().orElse(url); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/HttpSourceTester.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Maps; 4 | import lt.tokenmill.crawling.data.*; 5 | import lt.tokenmill.crawling.parser.ArticleExtractor; 6 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 7 | 8 | import java.util.Map; 9 | 10 | import static com.google.common.base.Strings.nullToEmpty; 11 | 12 | public class HttpSourceTester { 13 | 14 | public static final String URL_ACCEPTED = "url_accepted"; 15 | public static final String TITLE = "title"; 16 | public static final String TEXT = "text"; 17 | public static final String DATE = "date"; 18 | 19 | public static Map<String, Difference> test(HttpSource source, HttpSourceTest data) { 20 | TestResult result = new TestResult(); 21 | 22 | String url = data.getUrl(); 23 | UrlFilters urlFilters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters()); 24 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url); 25 | result.acceptedUrl(filteringResult.getAccepted(), data.getUrlAccepted()); 26 | 27 | String html = nullToEmpty(data.getHtml()).trim(); 28 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, source, null); 29 | HttpArticle article = parseResult.getArticle(); 30 | result.title(nullToEmpty(article.getTitle()), nullToEmpty(data.getTitle())); 31 | result.text(nullToEmpty(article.getText()), nullToEmpty(data.getText())); 32 | result.date(article.getPublished() != null ?
DataUtils.formatInUTC(article.getPublished()) : "", nullToEmpty(data.getDate())); 33 | 34 | return result.difference(); 35 | } 36 | 37 | public static class Difference { 38 | 39 | private String actual; 40 | 41 | private String expected; 42 | 43 | public Difference(String actual, String expected) { 44 | this.actual = actual; 45 | this.expected = expected; 46 | } 47 | 48 | public String getActual() { 49 | return actual; 50 | } 51 | 52 | public String getExpected() { 53 | return expected; 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return "Difference{" + 59 | "actual='" + actual + '\'' + 60 | ", expected='" + expected + '\'' + 61 | '}'; 62 | } 63 | } 64 | 65 | public static class TestResult { 66 | 67 | private boolean expectedUrlAccepted; 68 | private boolean actualUrlAccepted; 69 | private String expectedTitle; 70 | private String actualTitle; 71 | private String expectedText; 72 | private String actualText; 73 | private String expectedDate; 74 | private String actualDate; 75 | 76 | void acceptedUrl(boolean actual, boolean expected) { 77 | this.expectedUrlAccepted = expected; 78 | this.actualUrlAccepted = actual; 79 | } 80 | 81 | public void title(String actual, String expected) { 82 | this.expectedTitle = expected.trim(); 83 | this.actualTitle = actual.trim(); 84 | } 85 | 86 | public void text(String actual, String expected) { 87 | this.expectedText = expected.trim(); 88 | this.actualText = actual.trim(); 89 | } 90 | 91 | public void date(String actual, String expected) { 92 | this.expectedDate = expected.trim(); 93 | this.actualDate = actual.trim(); 94 | } 95 | 96 | public Map<String, Difference> difference() { 97 | Map<String, Difference> result = Maps.newLinkedHashMap(); 98 | if (expectedUrlAccepted != actualUrlAccepted) { 99 | result.put(URL_ACCEPTED, 100 | new Difference(String.valueOf(actualUrlAccepted), String.valueOf(expectedUrlAccepted))); 101 | } 102 | if (!expectedTitle.equals(actualTitle)) { 103 | result.put(TITLE, new Difference(actualTitle, expectedTitle)); 104 | } 105 | if (!expectedText.equals(actualText)) { 106 | result.put(TEXT, new Difference(actualText, expectedText)); 107 | } 108 | if (!expectedDate.equals(actualDate)) { 109 | result.put(DATE, new Difference(actualDate, expectedDate)); 110 | } 111 | return result; 112 | } 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/QueryParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.base.Strings; 4 | import com.google.common.collect.Lists; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.stream.Collectors; 9 | 10 | public class QueryParser { 11 | 12 | public static List<String> parseQuery(String query) { 13 | List<String> result = Lists.newArrayList(); 14 | if (!Strings.isNullOrEmpty(query)) { 15 | query = query.replaceAll("(\\s*[+-]\\s*)", "#SPLIT#$1"); 16 | return Arrays.stream(query.split("(#SPLIT#| )")) 17 | .map(String::trim) 18 | .filter(s -> !s.isEmpty()) 19 | .collect(Collectors.toList()); 20 | } 21 | return result; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/TextFilters.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import java.util.List; 4 | import java.util.Objects; 5 |
import java.util.regex.Pattern; 6 | 7 | public class TextFilters { 8 | 9 | // Normalizer is of format [match regexp]-->>[replacement string] 10 | // Normalizers that don't match the format are ignored 11 | // [match regexp]s that don't compile are ignored 12 | // String t can be null. 13 | // if textNormalizers is null then t is returned. 14 | public static String normalizeText(String t, List<String> textNormalizers) { 15 | t = Objects.toString(t, ""); 16 | if (textNormalizers == null) 17 | return t; 18 | return textNormalizers.stream() 19 | .filter(tn -> tn.contains("-->>")) 20 | .reduce(t, (a, tn) -> { 21 | String[] parts = tn.split("-->>"); 22 | String match = parts[0]; 23 | try { 24 | Pattern.compile(match); 25 | } catch (Exception e) { 26 | return a; 27 | } 28 | String replacement = parts.length > 1 ? parts[1] : ""; 29 | return a.replaceAll(match, replacement); 30 | }).trim(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/AljazeeraExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.junit.Test; 6 | 7 | import java.util.Arrays; 8 | 9 | import static junit.framework.TestCase.assertEquals; 10 | 11 | public class AljazeeraExtractorTest extends BaseArticleExtractorTest { 12 | 13 | @Test 14 | public void testAljazeera1() throws Exception { 15 | String html = loadArticle("aljazeera1"); 16 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html"; 17 | HttpArticle article = ArticleExtractor.extractArticle(html, url, getSourceConf(), null); 18 | assertEquals("2018-05-13T00:00:00.000Z", article.getPublished().toInstant().toString()); 19 | } 20 | 21 | private HttpSource getSourceConf() { 22 | HttpSource source = new HttpSource(); 23 | source.setDateSelectors(Arrays.asList(".article-duration")); 24 | return source; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/BaseArticleExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | 6 | import java.net.URL; 7 | 8 | public abstract class BaseArticleExtractorTest { 9 | 10 | protected String loadArticle(String name) throws Exception { 11 | URL htmlResource = Resources.getResource("articles/" + name + ".html"); 12 | return Resources.toString(htmlResource, Charsets.UTF_8); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/BloombergExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 5 | import lt.tokenmill.crawling.data.HttpSource; 6 | import org.joda.time.DateTime; 7 | import org.joda.time.DateTimeZone; 8 | import org.junit.Test; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | import static org.junit.Assert.assertTrue; 12 | 13 | public class BloombergExtractorTest extends
BaseArticleExtractorTest { 14 | 15 | 16 | @Test 17 | public void testBloomberg1() throws Exception { 18 | String html = loadArticle("bloomberg1"); 19 | String url = "http://www.bloomberg.com/news/articles/2016-09-08/japan-index-futures-signal-bounce-as-ecb-outlook-weighs-on-bonds"; 20 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, bloombergSource(), null); 21 | HttpArticle article = parseResult.getArticle(); 22 | assertEquals("Stocks Sink With Bonds, Dollar Rallies as Complacency Broken", article.getTitle()); 23 | assertTrue(article.getText().contains("erted declines of this size in stocks and bonds are rare though not ")); 24 | assertTrue(article.getText().startsWith("Tranquility that has enveloped global")); 25 | assertEquals(parseResult.getPublishedMatches().get(0), "META:parsely-pub-date"); 26 | DateTime actualPublished = article.getPublished(); 27 | DateTime expectedPublished = new DateTime(2016, 9, 8, 23, 14, 29, 36, DateTimeZone.UTC); 28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 29 | } 30 | 31 | private HttpSource bloombergSource() { 32 | HttpSource source = new HttpSource(); 33 | return source; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/CyberscoopExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.junit.Test; 8 | 9 | import java.util.Arrays; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | 13 | public class CyberscoopExtractorTest extends BaseArticleExtractorTest { 14 | 15 | private static final String TITLE_SELECTOR = "h1.article__title"; 16 | 17 | private HttpSource cyberscoopSourceWithoutTitleSelector() { 18 | HttpSource source = new HttpSource(); 19 | return source; 20 | } 21 | 22 | private HttpSource cyberscoopSourceWithTitleSelector() { 23 | HttpSource source = new HttpSource(); 24 | source.setTitleSelectors(Arrays.asList(TITLE_SELECTOR)); 25 | return source; 26 | } 27 | 28 | @Test 29 | public void testTitleExtraction000() throws Exception { 30 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/"; 31 | String html = loadArticle("cyberscoop1"); 32 | Document document = Jsoup.parse(html, url); 33 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithoutTitleSelector(), null); 34 | assertEquals(1, article.getTitleMatches().size()); 35 | assertEquals("META:og:title", article.getTitleMatches().get(0)); 36 | } 37 | 38 | @Test 39 | public void testTitleExtraction001() throws Exception { 40 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/"; 41 | String html = loadArticle("cyberscoop1"); 42 | Document document = Jsoup.parse(html, url); 43 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithTitleSelector(), null); 44 | assertEquals(1, article.getTitleMatches().size()); 45 | assertEquals(TITLE_SELECTOR, article.getTitleMatches().get(0)); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- 
/parser/src/test/java/lt/tokenmill/crawling/parser/FortuneExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.TestCase.assertEquals; 8 | 9 | public class FortuneExtractorTest extends BaseArticleExtractorTest { 10 | 11 | @Test 12 | public void testFortune1() throws Exception { 13 | String html = loadArticle("fortune1"); 14 | String url = "http://fortune.com/2017/04/13/susan-fowler-uber-editor-stripe/"; 15 | HttpArticle article = ArticleExtractor.extractArticle(html, url, fortuneSource(), "2017/04/13"); 16 | assertEquals("2017-04-13T00:00:00.000Z", article.getPublished().toInstant().toString()); 17 | } 18 | 19 | private HttpSource fortuneSource() { 20 | HttpSource source = new HttpSource(); 21 | return source; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/InvestingParserTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.DateTimeZone; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertTrue; 13 | 14 | public class InvestingParserTest extends BaseArticleExtractorTest { 15 | 16 | 17 | @Test 18 | public void testInvesting1() throws Exception { 19 | String html = loadArticle("investing1"); 20 | String url = "https://www.investing.com/analysis/opening-bell:-brexit,-davos-meetings-are-today%E2%80%99s-big-drivers-200172664"; 21 | HttpArticleParseResult result = ArticleExtractor.extractArticleWithDetails(html, url, investingSource(), null); 22 | HttpArticle article = result.getArticle(); 23 | assertEquals("Opening Bell: USD Drops, Pound Pops, Yen Soars", article.getTitle()); 24 | assertTrue(article.getText().startsWith("by Eli Wright\nAs markets in the US return from the long holiday weekend")); 25 | assertTrue(article.getText().endsWith("ab Corporation (NYSE:SCHW) expects EPS of $0.36.")); 26 | DateTime actualPublished = article.getPublished(); 27 | DateTime expectedPublished = new DateTime(2017, 1, 17, 11, 8, 00, 00, DateTimeZone.UTC); 28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 29 | } 30 | 31 | private HttpSource investingSource() { 32 | HttpSource source = new HttpSource(); 33 | source.setTextSelectors(Lists.newArrayList("#contentSection p, #contentSection li")); 34 | source.setDateSelectors(Lists.newArrayList(".contentSectionDetails span")); 35 | source.setDateRegexps(Lists.newArrayList(".*\\((.+)\\).*")); 36 | return source; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/KedainietisTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 5 | import lt.tokenmill.crawling.data.HttpSource; 
6 | import org.junit.Test; 7 | 8 | import java.util.Arrays; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | import static org.junit.Assert.assertNotNull; 12 | import static org.junit.Assert.assertTrue; 13 | 14 | public class KedainietisTest extends BaseArticleExtractorTest{ 15 | 16 | private HttpSource kedainietisSource() { 17 | HttpSource source = new HttpSource(); 18 | source.setLanguage("lt"); 19 | source.setDateSelectors(Arrays.asList("span.dtreviewed")); 20 | return source; 21 | } 22 | 23 | @Test 24 | public void testKedainietis() throws Exception { 25 | String html = loadArticle("kedainietis"); 26 | String url = "http://www.kedainietis.lt/naujienos/naujienos/nedeklaravus-gyvenamosios-vietos-nepasieks-ir-sodros-mokami-alimentai-17694/"; 27 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, kedainietisSource(), null); 28 | HttpArticle article = parseResult.getArticle(); 29 | assertEquals("Nedeklaravus gyvenamosios vietos, nepasieks ir „Sodros“ mokami alimentai".trim(), article.getTitle().trim()); 30 | assertTrue(article.getText().contains("valstybės biudžeto Lietuvoje")); 31 | assertTrue(article.getText().startsWith("Iš valstybės")); 32 | assertEquals(parseResult.getPublishedMatches().get(0), "span.dtreviewed"); 33 | assertNotNull(article.getPublished()); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/ReutersExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.DateTimeZone; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertNull; 13 | import static org.junit.Assert.assertTrue; 14 | 15 | public class ReutersExtractorTest extends BaseArticleExtractorTest { 16 | 17 | 18 | @Test 19 | public void testReuters1() throws Exception { 20 | String html = loadArticle("reuters1"); 21 | String url = "http://www.reuters.com/finance/stocks/TEX/key-developments/article/3414284"; 22 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null); 23 | assertEquals("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring- CNBC, citing source", article.getTitle()); 24 | assertTrue(article.getText().contains("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring; Marcato supports Terex CEO - CNBC, citing source")); 25 | DateTime actualPublished = article.getPublished(); 26 | DateTime expectedPublished = new DateTime(2016, 7, 28, 15, 35, DateTimeZone.UTC); 27 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 28 | } 29 | 30 | @Test 31 | public void testReuters2() throws Exception { 32 | String html = loadArticle("reuters2"); 33 | String url = "http://www.reuters.com/article/idUSFWN1B40B5"; 34 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, reutersSource(), null); 35 | HttpArticle article = parseResult.getArticle(); 36 | assertEquals("BRIEF-Canadian Solar unit Recurrent Energy reached commercial operation of 100 MWac/134 MWp", article.getTitle()); 37 | assertTrue(article.getText().contains("Unit 
Recurrent Energy has reached commercial operation of 100 MWac/134 MWp Mustang solar power project")); 38 | assertEquals("LD+JSON", parseResult.getPublishedMatches().get(0)); 39 | DateTime expectedPublished = new DateTime(2016, 8, 23, 12, 24, 3, DateTimeZone.UTC); 40 | DateTime actualPublished = article.getPublished(); 41 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 42 | } 43 | 44 | @Test 45 | public void testReuters3() throws Exception { 46 | String html = loadArticle("reuters3"); 47 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null); 49 | assertEquals("Tesla touts speed and driving range with new upgraded battery", article.getTitle()); 50 | assertTrue(article.getText().contains(" models. But Musk said those were both millio")); 51 | DateTime expectedPublished = new DateTime(2016, 8, 23, 22, 41, 57, DateTimeZone.UTC); 52 | DateTime actualPublished = article.getPublished(); 53 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 54 | } 55 | 56 | @Test 57 | public void testReutersBlog1() throws Exception { 58 | String html = loadArticle("reuters-blogs1"); 59 | String url = "http://blogs.reuters.com/breakingviews/2016/08/22/pfizer-bets-14-bln-it-knows-better-than-market/"; 60 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersBlogsSource(), null); 61 | assertEquals("Pfizer bets $14 bln it knows better than market", article.getTitle()); 62 | assertTrue(article.getText().contains("r may believe in a far more lucrative outcom")); 63 | DateTime actualPublished = article.getPublished(); 64 | assertNull(actualPublished); 65 | } 66 | 67 | 68 | private HttpSource reutersSource() { 69 | HttpSource source = new HttpSource(); 70 | source.setTitleSelectors(Lists.newArrayList("h1")); 71 | source.setDateSelectors(Lists.newArrayList("#sigDevArticleText .timestamp")); 72 | source.setTextSelectors(Lists.newArrayList("#article-text p")); 73 | return source; 74 | } 75 | 76 | private HttpSource reutersBlogsSource() { 77 | HttpSource source = new HttpSource(); 78 | source.setTextSelectors(Lists.newArrayList("#postcontent p")); 79 | return source; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class UrlExtractorTest extends BaseArticleExtractorTest { 11 | 12 | @Test 13 | public void testExtraction00() throws Exception { 14 | String html = loadArticle("aljazeera1"); 15 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html"; 16 | Document document = Jsoup.parse(html); 17 | assertEquals(url, UrlExtractor.extract(url, document)); 18 | assertEquals("https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html", UrlExtractor.extract("", document)); 19 | } 20 | 21 | @Test 22 | public void testExtraction01() throws Exception { 23 | String html = loadArticle("kedainietis"); 24 | String url = "url"; 25 | Document document = 
Jsoup.parse(html); 26 | assertEquals(url, UrlExtractor.extract(url, document)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlFiltersTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.Arrays; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class UrlFiltersTest { 10 | 11 | @Test 12 | public void testURLNormalizer000() { 13 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("a-->>b"), Arrays.asList()); 14 | assertEquals("bbbb", urlFilters.filterWithDetails("aaaa").getNormalized()); 15 | assertEquals("bbbb", urlFilters.filterWithDetails("abba").getNormalized()); 16 | 17 | urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList()); 18 | String url = "http://www.tokenmill.lt/#case-understand"; 19 | assertEquals("http://www.tokenmill.lt/", urlFilters.filterWithDetails(url).getNormalized()); 20 | } 21 | 22 | @Test 23 | public void testURLFilters000() { 24 | String url = "http://www.tokenmill.lt/#case-understand"; 25 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList("+^http://www.tokenmill.lt/.*", "-.*apache.*")); 26 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url); 27 | assertEquals(true, filteringResult.getAccepted()); 28 | assertEquals("+^http://www.tokenmill.lt/.*", filteringResult.getFilter()); 29 | assertEquals(1, filteringResult.getNormalizers().size()); 30 | assertEquals("http://www.tokenmill.lt/", filteringResult.getNormalized()); 31 | 32 | assertEquals("http://www.tokenmill.lt/", urlFilters.filter(url)); 33 | assertEquals(null, urlFilters.filter("http://nutch.apache.org/")); 34 | 35 | filteringResult = urlFilters.filterWithDetails("http://nutch.apache.org/"); 36 | assertEquals(false, filteringResult.getAccepted()); 37 | assertEquals("-.*apache.*", filteringResult.getFilter()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/utils/HttpSourceTesterTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.DataUtils; 5 | import lt.tokenmill.crawling.data.HttpArticle; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import lt.tokenmill.crawling.data.HttpSourceTest; 8 | import lt.tokenmill.crawling.parser.ArticleExtractor; 9 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest; 10 | import org.junit.Test; 11 | 12 | import java.util.Map; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class HttpSourceTesterTest extends BaseArticleExtractorTest { 17 | 18 | @Test 19 | public void exactMatch() throws Exception { 20 | String html = loadArticle("reuters3"); 21 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 22 | 23 | HttpSource source = new HttpSource(); 24 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$")); 25 | 26 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null); 27 | 28 | HttpSourceTest sourceTest = new HttpSourceTest(); 29 | sourceTest.setHtml(html); 30 | sourceTest.setUrl(url); 31 | sourceTest.setUrlAccepted(true); 32 | 
sourceTest.setTitle(article.getTitle()); 33 | sourceTest.setDate(DataUtils.formatInUTC(article.getPublished())); 34 | sourceTest.setText(article.getText()); 35 | 36 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest); 37 | assertEquals(0, differences.size()); 38 | } 39 | 40 | @Test 41 | public void allDifferent() throws Exception { 42 | String html = loadArticle("reuters3"); 43 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 44 | 45 | HttpSource source = new HttpSource(); 46 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$")); 47 | 48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null); 49 | 50 | HttpSourceTest sourceTest = new HttpSourceTest(); 51 | sourceTest.setHtml(html); 52 | sourceTest.setUrl(url); 53 | sourceTest.setUrlAccepted(false); 54 | sourceTest.setTitle("Title"); 55 | sourceTest.setDate("Published"); 56 | sourceTest.setText("Text"); 57 | 58 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest); 59 | assertEquals(4, differences.size()); 60 | assertEquals("false", differences.get(HttpSourceTester.URL_ACCEPTED).getExpected()); 61 | assertEquals("true", differences.get(HttpSourceTester.URL_ACCEPTED).getActual()); 62 | assertEquals("Title", differences.get(HttpSourceTester.TITLE).getExpected()); 63 | assertEquals(article.getTitle(), differences.get(HttpSourceTester.TITLE).getActual()); 64 | assertEquals("Published", differences.get(HttpSourceTester.DATE).getExpected()); 65 | assertEquals(DataUtils.formatInUTC(article.getPublished()), differences.get(HttpSourceTester.DATE).getActual()); 66 | assertEquals("Text", differences.get(HttpSourceTester.TEXT).getExpected()); 67 | assertEquals(article.getText(), differences.get(HttpSourceTester.TEXT).getActual()); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/utils/QueryParserTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class QueryParserTest { 11 | 12 | @Test 13 | public void parseQuery() { 14 | List<String> parts = QueryParser.parseQuery("+Turkey-Inflation"); 15 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts); 16 | 17 | parts = QueryParser.parseQuery("+Turkey -Inflation"); 18 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts); 19 | 20 | parts = QueryParser.parseQuery("Turkey -Inflation"); 21 | assertEquals(Lists.newArrayList("Turkey", "-Inflation"), parts); 22 | 23 | parts = QueryParser.parseQuery("+Turkey attack"); 24 | assertEquals(Lists.newArrayList("+Turkey", "attack"), parts); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /parser/src/test/resources/jsonld/bbc-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "http:\/\/schema.org", 3 | "@type": "Article", 4 | "url": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745", 5 | "publisher": { 6 | "@type": "Organization", 7 | "name": "BBC News", 8 | "logo": { 9 | "@type": "ImageObject", 10 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1" 11 | } 12 | }, 13 | "datePublished": "2017-08-30T10:32:11+01:00", 14 | "dateModified":
"2017-08-30T10:32:11+01:00", 15 | "headline": "Venezuela: New assembly approves treason trials for opposition", 16 | "image": { 17 | "@type": "ImageObject", 18 | "width": 720, 19 | "height": 405, 20 | "url": "https:\/\/ichef-1.bbci.co.uk\/news\/720\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg" 21 | }, 22 | "thumbnailUrl": "https:\/\/ichef.bbci.co.uk\/news\/208\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg", 23 | "author": { 24 | "@type": "Organization", 25 | "name": "BBC News", 26 | "logo": { 27 | "@type": "ImageObject", 28 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1" 29 | } 30 | }, 31 | "mainEntityOfPage": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745" 32 | } -------------------------------------------------------------------------------- /ui-commons/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | ui-commons 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | elasticsearch 18 | 19 | 20 | lt.tokenmill.crawling 21 | parser 22 | 23 | 24 | 25 | 26 | 27 | release 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-source-plugin 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-jar-plugin 38 | 39 | 40 | 41 | org.apache.maven.plugins 42 | maven-javadoc-plugin 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /ui-commons/src/main/java/lt/tokenmill/crawling/commonui/Configuration.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.commonui; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.util.Properties; 7 | 8 | public class Configuration { 9 | 10 | public static final Configuration INSTANCE = new Configuration(); 11 | 12 | private static final String DEFAULT_CONFIG_FILE_LOCATION = "conf/development.properties"; 13 | private final Properties properties = new Properties(); 14 | 15 | private Configuration() { 16 | try { 17 | properties.load(new FileInputStream(new File(System.getProperty("config", DEFAULT_CONFIG_FILE_LOCATION)))); 18 | } catch (IOException e) { 19 | throw new RuntimeException(e); 20 | } 21 | } 22 | 23 | public String getString(String key, String defaultValue) { 24 | return properties.getProperty(key, defaultValue); 25 | } 26 | 27 | public int getInt(String key, int defaultValue) { 28 | return Integer.parseInt(properties.getProperty(key, Integer.toString(defaultValue))); 29 | } 30 | 31 | public String getString(String key) { 32 | return properties.getProperty(key); 33 | } 34 | 35 | public int getInt(String key) { 36 | return Integer.parseInt(properties.getProperty(key)); 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "Configuration{" + 42 | "properties='" + properties + "'" + 43 | "}"; 44 | } 45 | } -------------------------------------------------------------------------------- /ui-commons/src/main/java/lt/tokenmill/crawling/commonui/ElasticSearch.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.commonui; 2 | 3 | import lt.tokenmill.crawling.es.*; 4 | 5 | public class ElasticSearch { 6 | 7 | private static ElasticConnection CONNECTION; 8 | private static EsHttpSourceOperations HTTP_SOURCE_OPERATIONS; 9 | private static EsHttpSourceTestOperations 
HTTP_SOURCE_TEST_OPERATIONS; 10 | private static EsNamedQueryOperations NAMED_QUERY_OPERATIONS; 11 | private static EsDocumentOperations DOCUMENT_OPERATIONS; 12 | private static EsHttpUrlOperations URL_OPERATIONS; 13 | 14 | public static EsHttpSourceOperations getHttpSourceOperations() { 15 | if (HTTP_SOURCE_OPERATIONS == null) { 16 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM); 17 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM); 18 | HTTP_SOURCE_OPERATIONS = EsHttpSourceOperations.getInstance(getEsConnection(), index, type); 19 | } 20 | return HTTP_SOURCE_OPERATIONS; 21 | } 22 | 23 | public static EsHttpSourceTestOperations getHttpSourceTestOperations() { 24 | if (HTTP_SOURCE_TEST_OPERATIONS == null) { 25 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM); 26 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_TYPE_PARAM); 27 | HTTP_SOURCE_TEST_OPERATIONS = EsHttpSourceTestOperations.getInstance(getEsConnection(), index, type); 28 | } 29 | return HTTP_SOURCE_TEST_OPERATIONS; 30 | } 31 | 32 | public static EsNamedQueryOperations getNamedQueryOperations() { 33 | if (NAMED_QUERY_OPERATIONS == null) { 34 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_INDEX_PARAM); 35 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_TYPE_PARAM); 36 | NAMED_QUERY_OPERATIONS = EsNamedQueryOperations.getInstance(getEsConnection(), index, type); 37 | } 38 | return NAMED_QUERY_OPERATIONS; 39 | } 40 | 41 | 42 | public static EsDocumentOperations getDocumentOperations() { 43 | if (DOCUMENT_OPERATIONS == null) { 44 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_INDEX_NAME_PARAM); 45 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_DOC_TYPE_PARAM); 46 | DOCUMENT_OPERATIONS = EsDocumentOperations.getInstance(getEsConnection(), index, type); 47 | } 48 | return DOCUMENT_OPERATIONS; 49 | } 50 | 51 | public static EsHttpUrlOperations getUrlOperations() { 52 | if (URL_OPERATIONS == null) { 53 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_INDEX_NAME_PARAM); 54 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_DOC_TYPE_PARAM); 55 | URL_OPERATIONS = EsHttpUrlOperations.getInstance(getEsConnection(), index, type); 56 | } 57 | return URL_OPERATIONS; 58 | } 59 | 60 | private static ElasticConnection getEsConnection() { 61 | if (CONNECTION == null) { 62 | String hostname = Configuration.INSTANCE.getString(ElasticConstants.ES_HOSTNAME_PARAM, "localhost"); 63 | int restPort = Configuration.INSTANCE.getInt(ElasticConstants.ES_REST_PORT, 9200); 64 | String restScheme = Configuration.INSTANCE.getString(ElasticConstants.ES_REST_SCHEME, "http"); 65 | CONNECTION = ElasticConnection.getConnection(hostname, restPort, restScheme); 66 | } 67 | return CONNECTION; 68 | } 69 | } 70 | --------------------------------------------------------------------------------
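Usage sketch (illustrative, not a file from the repository): the parser module can be driven end to end with just HttpSource, ArticleExtractor and HttpSourceTester, as the tests above do. The class below is a minimal sketch assembled from the APIs visible in this listing; the URL, HTML and selector values are hypothetical placeholders that would have to match a real site's markup.

import com.google.common.collect.Lists;
import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.ArticleExtractor;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class ParserPipelineExample {

    public static void main(String[] args) {
        // Hypothetical source definition; selectors depend entirely on the target site.
        HttpSource source = new HttpSource();
        source.setUrlFilters(Lists.newArrayList("+^https?://example\\.com/news/.*"));
        source.setTitleSelectors(Lists.newArrayList("h1.headline"));
        source.setTextSelectors(Lists.newArrayList("article p"));
        source.setDateSelectors(Lists.newArrayList(".published-at"));

        String url = "https://example.com/news/sample-article";
        String html = "<html>...</html>"; // in practice, the fetched page body

        // One-shot extraction of title, body text and publication date.
        HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null);
        System.out.println(article.getTitle());

        // Regression check: an empty difference map means the source
        // configuration still extracts exactly what the test expects.
        HttpSourceTest expectation = new HttpSourceTest();
        expectation.setUrl(url);
        expectation.setHtml(html);
        expectation.setUrlAccepted(true);
        expectation.setTitle("Expected headline");
        Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, expectation);
        differences.forEach((field, diff) -> System.out.println(field + ": " + diff));
    }
}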
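URL normalization and filtering can also be exercised on its own; the inputs and expected outputs below are taken directly from UrlFiltersTest above.

import lt.tokenmill.crawling.parser.urls.UrlFilters;

import java.util.Arrays;

public class UrlFiltersExample {

    public static void main(String[] args) {
        // Normalizers use the "regexp-->>replacement" syntax;
        // filters are "+regexp" (accept) and "-regexp" (reject) rules.
        UrlFilters filters = UrlFilters.create(
                Arrays.asList("#.*-->>"),                      // strip URL fragments
                Arrays.asList("+^http://www.tokenmill.lt/.*",  // accept site URLs
                        "-.*apache.*"));                       // reject anything matching "apache"

        // An accepted URL is returned in normalized form; a rejected URL yields null.
        System.out.println(filters.filter("http://www.tokenmill.lt/#case-understand")); // http://www.tokenmill.lt/
        System.out.println(filters.filter("http://nutch.apache.org/"));                 // null
    }
}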
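TitleParser and UrlExtractor both work from a parsed Jsoup document. A small sketch with a hypothetical page, showing how each candidate title carries the selector or meta key that produced it, and how canonical-URL extraction falls back to the passed-in URL when the document yields nothing absolute.

import lt.tokenmill.crawling.parser.TitleParser;
import lt.tokenmill.crawling.parser.data.MatchedString;
import lt.tokenmill.crawling.parser.urls.UrlExtractor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.List;

public class MetadataExample {

    public static void main(String[] args) {
        // Hypothetical page with an og:title meta tag and a canonical link.
        Document document = Jsoup.parse(
                "<html><head>"
                        + "<meta property='og:title' content='Example headline'/>"
                        + "<link rel='canonical' href='https://example.com/article'/>"
                        + "</head><body></body></html>");

        // No [itemprop*=headline] element here, so the og:title meta value wins.
        List<MatchedString> titles = TitleParser.extractFromMeta(document);
        titles.forEach(t -> System.out.println(t.getValue() + " <- " + t.getMatch())); // Example headline <- META:og:title

        // The canonical link is absolute, so it replaces the fallback URL.
        System.out.println(UrlExtractor.extract("https://fallback.example.com", document)); // https://example.com/article
    }
}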
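The two text utilities are pure static functions, so their behavior is easy to pin down with concrete inputs; the expected outputs in the comments follow from QueryParserTest and from the normalizer format documented in TextFilters ([match regexp]-->>[replacement string]).

import lt.tokenmill.crawling.parser.utils.QueryParser;
import lt.tokenmill.crawling.parser.utils.TextFilters;

import java.util.Arrays;
import java.util.List;

public class TextUtilsExample {

    public static void main(String[] args) {
        // Query terms are split in front of each +/- operator.
        List<String> parts = QueryParser.parseQuery("+Turkey-Inflation");
        System.out.println(parts); // [+Turkey, -Inflation]

        // Normalizers apply in order; an entry without "-->>" or with a
        // regexp that does not compile is skipped rather than failing.
        List<String> normalizers = Arrays.asList(
                "Advertisement-->>", // drop boilerplate
                "\\s+-->> ");        // collapse whitespace runs
        System.out.println(TextFilters.normalizeText("Advertisement\n\nSome   text", normalizers)); // Some text
    }
}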
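Configuration in ui-commons resolves its properties file from the config system property and falls back to conf/development.properties, as its constructor shows. A minimal sketch of how a UI process might read connection settings; the literal property keys below are assumptions for illustration only, since the real key names come from ElasticConstants, which is not part of this listing.

import lt.tokenmill.crawling.commonui.Configuration;

public class ConfigurationExample {

    public static void main(String[] args) {
        // Started as: java -Dconfig=/path/to/your.properties ConfigurationExample
        // Without -Dconfig, conf/development.properties is loaded.
        String host = Configuration.INSTANCE.getString("es.hostname", "localhost"); // illustrative key
        int port = Configuration.INSTANCE.getInt("es.rest.port", 9200);             // illustrative key
        System.out.println("Elasticsearch at " + host + ":" + port);
    }
}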