├── .gitattributes
├── .github
│   └── tokenmill-logo.svg
├── .gitignore
├── .gitlab-ci.yml
├── Dockerfile.base
├── Dockerfile.crawler
├── Dockerfile.es
├── Dockerfile.ui
├── LICENSE
├── Makefile
├── README.md
├── administration-ui
│   ├── conf
│   │   ├── development.properties
│   │   └── docker-compose.properties
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── adminui
│       │   │                   ├── Application.java
│       │   │                   ├── CrawlerAdminUI.java
│       │   │                   ├── HttpSourceTestsCache.java
│       │   │                   ├── utils
│       │   │                   │   ├── CSVUtils.java
│       │   │                   │   ├── GridUtils.java
│       │   │                   │   ├── HttpSourceCSVUtils.java
│       │   │                   │   └── HttpSourceTestCSVUtils.java
│       │   │                   └── view
│       │   │                       ├── BaseView.java
│       │   │                       ├── HttpSourceForm.java
│       │   │                       ├── HttpSourceStatsWindow.java
│       │   │                       ├── HttpSourceTestWindow.java
│       │   │                       ├── HttpSourcesView.java
│       │   │                       ├── ImportExportView.java
│       │   │                       ├── imports
│       │   │                       │   ├── HttpSourceImportExport.java
│       │   │                       │   ├── HttpSourceTestImportExport.java
│       │   │                       │   └── NamedQueryImportExport.java
│       │   │                       ├── namedquery
│       │   │                       │   ├── NamedQueriesView.java
│       │   │                       │   ├── NamedQueryFormWindow.java
│       │   │                       │   └── NamedQueryResultsPanel.java
│       │   │                       ├── pageanalysis
│       │   │                       │   └── PageAnalysisView.java
│       │   │                       └── sourcetest
│       │   │                           ├── HttpSourceAllTestsWindow.java
│       │   │                           ├── HttpSourceTestFormWindow.java
│       │   │                           ├── HttpSourceTestsView.java
│       │   │                           └── TestResultsPanel.java
│       │   ├── resources
│       │   │   ├── log4j.properties
│       │   │   └── log4j2.properties
│       │   └── webapp
│       │       └── VAADIN
│       │           └── themes
│       │               └── crawleradmintheme
│       │                   ├── addons.scss
│       │                   ├── crawleradmintheme.scss
│       │                   ├── styles.css
│       │                   └── styles.scss
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── adminui
│           │                   └── utils
│           │                       ├── HttpSourceTestCSVUtilsTest.java
│           │                       └── HttpSourcesCSVUtilsTest.java
│           └── resources
│               └── www.tokenmill.lt.html
├── analysis-ui
│   ├── conf
│   │   └── development.properties
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── analysisui
│           │                   ├── AnalysisUI.java
│           │                   ├── Application.java
│           │                   ├── search
│           │                   │   └── ResultPanel.java
│           │                   └── view
│           │                       ├── BaseView.java
│           │                       ├── ContextCloudView.java
│           │                       └── SearchView.java
│           ├── resources
│           │   └── log4j.properties
│           └── webapp
│               └── VAADIN
│                   └── themes
│                       └── analysistheme
│                           ├── addons.scss
│                           ├── analysistheme.scss
│                           ├── styles.css
│                           └── styles.scss
├── bin
│   ├── create-es-index.sh
│   ├── create-es-indices.sh
│   ├── deploy-crawler.sh
│   ├── run-administration-ui.sh
│   ├── run-analysis-ui.sh
│   └── run-crawler.sh
├── crawler
│   ├── conf
│   │   ├── docker-compose.yaml
│   │   └── local.yaml
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── crawler
│       │   │                   ├── CrawlerConstants.java
│       │   │                   ├── CrawlerTopology.java
│       │   │                   ├── DefaultServiceProvider.java
│       │   │                   ├── ServiceProvider.java
│       │   │                   ├── bolt
│       │   │                   │   ├── ArticleIndexerBolt.java
│       │   │                   │   ├── LinkExtractorBolt.java
│       │   │                   │   └── StatusUpdaterBolt.java
│       │   │                   ├── spout
│       │   │                   │   ├── HttpSourceConfiguration.java
│       │   │                   │   └── UrlGeneratorSpout.java
│       │   │                   └── utils
│       │   │                       ├── PrioritizedSource.java
│       │   │                       ├── UrlFilterUtils.java
│       │   │                       └── UrlFiltersCache.java
│       │   └── resources
│       │       ├── urlfilters.json
│       │       └── urlfilters.txt
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── crawler
│                               └── spout
│                                   ├── UrlFilterUtilsTest.java
│                                   └── UrlGeneratorSpoutTest.java
├── data-model
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── data
│       │                       ├── DataUtils.java
│       │                       ├── HighlightedSearchResult.java
│       │                       ├── HtmlAnalysisResult.java
│       │                       ├── HttpArticle.java
│       │                       ├── HttpArticleParseResult.java
│       │                       ├── HttpSource.java
│       │                       ├── HttpSourceTest.java
│       │                       ├── HttpUrl.java
│       │                       ├── NamedQuery.java
│       │                       └── PageableList.java
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── data
│                               └── DataUtilsTest.java
├── docker-compose.dev.yml
├── docker-compose.run.yml
├── elasticsearch
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── es
│       │   │                   ├── BaseElasticOps.java
│       │   │                   ├── ElasticConnection.java
│       │   │                   ├── ElasticConstants.java
│       │   │                   ├── EsDataParser.java
│       │   │                   ├── EsDocumentOperations.java
│       │   │                   ├── EsHttpSourceOperations.java
│       │   │                   ├── EsHttpSourceTestOperations.java
│       │   │                   ├── EsHttpSourcesCache.java
│       │   │                   ├── EsHttpUrlOperations.java
│       │   │                   ├── EsNamedQueryOperations.java
│       │   │                   ├── Utils.java
│       │   │                   └── model
│       │   │                       └── DateHistogramValue.java
│       │   └── resources
│       │       └── indices
│       │           ├── document.json
│       │           ├── http_source.json
│       │           ├── http_source_test.json
│       │           ├── query.json
│       │           └── url.json
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── es
│           │                   ├── ElasticConnectionTest.java
│           │                   ├── ElasticsearchTestServer.java
│           │                   ├── EsDocumentOperationsTest.java
│           │                   ├── EsHttpSourceOperationsTest.java
│           │                   ├── EsHttpSourceTestOperationsTest.java
│           │                   ├── EsHttpUrlOperationsTestInt.java
│           │                   ├── IndexManager.java
│           │                   └── TestUtils.java
│           └── resources
│               ├── log4j.properties
│               ├── log4j2.properties
│               └── www.tokenmill.lt.html
├── page-analyzer
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── pageanalyzer
│       │                       └── PageAnalyzer.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── pageanalyzer
│           │                   └── PageAnalyzerTest.java
│           └── resources
│               └── bloomberg.com.html
├── parser
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── parser
│       │                       ├── ArticleExtractor.java
│       │                       ├── DateParser.java
│       │                       ├── PageAnalyzer.java
│       │                       ├── TitleParser.java
│       │                       ├── data
│       │                       │   ├── MatchedDate.java
│       │                       │   └── MatchedString.java
│       │                       ├── urls
│       │                       │   ├── UrlExtractor.java
│       │                       │   └── UrlFilters.java
│       │                       └── utils
│       │                           ├── HttpSourceTester.java
│       │                           ├── JsonLdParser.java
│       │                           ├── QueryParser.java
│       │                           ├── TextFilters.java
│       │                           └── TextProfileSignature.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── parser
│           │                   ├── AljazeeraExtractorTest.java
│           │                   ├── BaseArticleExtractorTest.java
│           │                   ├── BloombergExtractorTest.java
│           │                   ├── CyberscoopExtractorTest.java
│           │                   ├── DateParserTest.java
│           │                   ├── FortuneExtractorTest.java
│           │                   ├── InvestingParserTest.java
│           │                   ├── JsonLdParserTest.java
│           │                   ├── KedainietisTest.java
│           │                   ├── ReutersExtractorTest.java
│           │                   ├── urls
│           │                   │   ├── UrlExtractorTest.java
│           │                   │   └── UrlFiltersTest.java
│           │                   └── utils
│           │                       ├── HttpSourceTesterTest.java
│           │                       ├── QueryParserTest.java
│           │                       ├── TextFilterTest.java
│           │                       └── TextProfileSignatureTest.java
│           └── resources
│               ├── articles
│               │   ├── aljazeera1.html
│               │   ├── bbc1.html
│               │   ├── bloomberg1.html
│               │   ├── cyberscoop1.html
│               │   ├── fortune1.html
│               │   ├── ft1.html
│               │   ├── investing1.html
│               │   ├── kedainietis.html
│               │   ├── nbcnews1.html
│               │   ├── reuters-blogs1.html
│               │   ├── reuters1.html
│               │   ├── reuters2.html
│               │   ├── reuters3.html
│               │   └── usanews1.html
│               └── jsonld
│                   └── bbc-1.json
├── pom.xml
└── ui-commons
    ├── pom.xml
    └── src
        └── main
            └── java
                └── lt
                    └── tokenmill
                        └── crawling
                            └── commonui
                                ├── Configuration.java
                                └── ElasticSearch.java
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.html linguist-vendored
2 | *.css linguist-vendored
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | target/
3 | *.iml
4 | *.retry
5 |
6 | **/*.gwt.xml
7 | crawler/logs/
8 | **/.classpath
9 | **/.project
10 | **/.settings
11 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | stages:
2 | - base
3 | - test
4 | - build
5 |
6 | prepare-base-docker:
7 | stage: base
8 | image: docker:stable
9 | when: manual
10 | services:
11 | - docker:dind
12 | before_script:
13 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
14 | script:
15 | - docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/base:latest .
16 | - docker push registry.gitlab.com/tokenmill/crawling-framework/base:latest
17 | - docker rmi registry.gitlab.com/tokenmill/crawling-framework/base:latest
18 |
19 | prepare-base-elasticsearch:
20 | stage: base
21 | image: docker:stable
22 | when: manual
23 | services:
24 | - docker:dind
25 | before_script:
26 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
27 | script:
28 | - docker build -f Dockerfile.es -t registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest .
29 | - docker push registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
30 | - docker rmi registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
31 |
32 | prepare-administration-ui:
33 | stage: base
34 | image: docker:stable
35 | when: manual
36 | services:
37 | - docker:dind
38 | before_script:
39 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
40 | script:
41 | - docker build -f Dockerfile.ui -t registry.gitlab.com/tokenmill/crawling-framework/ui:latest .
42 | - docker push registry.gitlab.com/tokenmill/crawling-framework/ui:latest
43 | - docker rmi registry.gitlab.com/tokenmill/crawling-framework/ui:latest
44 |
45 | prepare-crawler:
46 | stage: base
47 | image: docker:stable
48 | when: manual
49 | services:
50 | - docker:dind
51 | before_script:
52 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
53 | script:
54 | - docker build -f Dockerfile.crawler -t registry.gitlab.com/tokenmill/crawling-framework/crawler:latest .
55 | - docker push registry.gitlab.com/tokenmill/crawling-framework/crawler:latest
56 | - docker rmi registry.gitlab.com/tokenmill/crawling-framework/crawler:latest
57 |
58 | unit-tests:
59 | stage: test
60 | image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
61 | when: always
62 | script:
63 | - mvn clean test
64 |
65 | integration-tests:
66 | stage: test
67 | image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
68 | services:
69 | - name: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
70 | alias: elasticsearch
71 | when: always
72 | script:
73 | - mvn -Dtest=*TestInt -DfailIfNoTests=false clean test
74 |
--------------------------------------------------------------------------------
/Dockerfile.base:
--------------------------------------------------------------------------------
1 | FROM maven:3.5.4-jdk-8-alpine as builder
2 |
3 | RUN mkdir -p /usr/src/cf
4 | WORKDIR /usr/src/cf
5 |
6 | COPY . .
7 |
8 | RUN mvn clean install
9 |
10 | FROM maven:3.5.4-jdk-8-alpine
11 | COPY --from=builder /root/.m2/ /root/.m2/
12 |
--------------------------------------------------------------------------------
/Dockerfile.crawler:
--------------------------------------------------------------------------------
1 | FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder
2 |
3 | RUN mkdir -p /usr/src/cf
4 | WORKDIR /usr/src/cf
5 |
6 | COPY . .
7 |
8 | RUN cd crawler && \
9 | mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests
10 |
11 | FROM maven:3.5.4-jdk-8-alpine
12 | RUN mkdir -p /usr/src/cf
13 | WORKDIR /usr/src/cf
14 |
15 | COPY --from=builder /usr/src/cf/crawler/target/crawler-standalone.jar crawler-standalone.jar
16 | COPY --from=builder /usr/src/cf/crawler/conf/docker-compose.yaml docker-compose.yaml
17 |
18 | CMD ["java", "-cp", "crawler-standalone.jar", "lt.tokenmill.crawling.crawler.CrawlerTopology", "-local", "-conf", "docker-compose.yaml"]
19 |
--------------------------------------------------------------------------------
/Dockerfile.es:
--------------------------------------------------------------------------------
1 | FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0 as builder
2 |
3 | ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/e1f115e4ca285c3c24e847c4dd4be955e0ed51c2/wait-for-it.sh /utils/wait-for-it.sh
4 |
5 | COPY bin/ bin/
6 | COPY elasticsearch/ elasticsearch/
7 |
8 | RUN /usr/local/bin/docker-entrypoint.sh elasticsearch -p /tmp/epid & /bin/bash /utils/wait-for-it.sh -t 0 localhost:9200 -- \
9 | ./bin/create-es-indices.sh ; \
10 | kill $(cat /tmp/epid) && wait $(cat /tmp/epid); exit 0;
11 |
12 | FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0
13 |
14 | COPY --from=builder /usr/share/elasticsearch/data /usr/share/elasticsearch/data
15 |
--------------------------------------------------------------------------------
/Dockerfile.ui:
--------------------------------------------------------------------------------
1 | FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder
2 |
3 | RUN mkdir -p /usr/src/cf
4 | WORKDIR /usr/src/cf
5 |
6 | COPY . .
7 |
8 | RUN cd administration-ui && mvn clean package -Pbigjar
9 |
10 | FROM maven:3.5.4-jdk-8-alpine
11 | RUN mkdir -p /usr/src/cf
12 | WORKDIR /usr/src/cf
13 |
14 | COPY --from=builder /usr/src/cf/administration-ui/target/administration-ui-standalone.jar administration-ui-standalone.jar
15 | COPY --from=builder /usr/src/cf/administration-ui/conf/docker-compose.properties docker-compose.properties
16 |
17 | CMD ["java", "-Dconfig=docker-compose.properties", "-jar", "administration-ui-standalone.jar"]
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2017-2019 Tokenmill, UAB
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | unit-test:
2 | mvn clean test
3 |
4 | run-dev-env:
5 | docker-compose -f docker-compose.dev.yml pull && \
6 | docker-compose -f docker-compose.dev.yml down && \
7 | docker-compose -f docker-compose.dev.yml build && \
8 | docker-compose -f docker-compose.dev.yml up --remove-orphans
9 |
10 | build-base-docker:
11 | docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/deps:latest .
12 |
13 | publish-base-docker: build-base-docker
14 | docker push registry.gitlab.com/tokenmill/crawling-framework/deps:latest
15 |
16 | run-framework:
17 | docker-compose -f docker-compose.run.yml pull && \
18 | docker-compose -f docker-compose.run.yml down && \
19 | docker-compose -f docker-compose.run.yml build && \
20 | docker-compose -f docker-compose.run.yml up --remove-orphans
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # Crawling Framework
6 |
7 | [](https://search.maven.org/search?q=g:%22lt.tokenmill.crawling%22%20AND%20a:%22crawling-framework%22)
8 | [](https://gitlab.com/tokenmill/crawling-framework/commits/master)
9 |
10 | Crawling Framework provides the instruments to configure and run your [Storm Crawler](http://stormcrawler.net/) based crawler. It mainly aims at easing the crawling of article-publishing sites such as news portals and blogs. With the help of the GUI tool the Crawling Framework provides, you can do the following (a configuration sketch follows the list):
11 |
12 | 1. Specify which sites to crawl.
13 | 1. Configure URL inclusion and exclusion filters, thus controlling which sections of the site will be fetched.
14 | 1. Specify which elements of the page carry the article's publication date, its title and its main body.
15 | 1. Define tests which validate that extraction rules are working.
16 |
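As a rough sketch of what one of these source configurations holds, the snippet below hand-builds an `HttpSource` from the `data-model` module (the setters mirror those exercised in `HttpSourcesCSVUtilsTest`; the concrete URL, filter and selector values are made-up examples, not rules shipped with the framework):

```java
import java.util.Arrays;
import lt.tokenmill.crawling.data.HttpSource;

public class ExampleSourceConfig {

    // Illustrative only: all field values below are assumptions, not shipped defaults.
    public static HttpSource exampleSource() {
        HttpSource source = new HttpSource();
        source.setUrl("http://www.tokenmill.lt/");   // site root to crawl
        source.setName("TokenMill");                 // human-readable source name
        source.setLanguage("en");
        source.setEnabled(true);
        // URL inclusion filter (item 2 above); the pattern is a made-up example.
        source.setUrlFilters(Arrays.asList("+^http://www\\.tokenmill\\.lt/.*"));
        // Page-element selectors (item 3 above) for title, body text and date.
        source.setTitleSelectors(Arrays.asList("h1.article-title"));
        source.setTextSelectors(Arrays.asList("div.article-body"));
        source.setDateSelectors(Arrays.asList("time.published"));
        return source;
    }
}
```

In the normal workflow the same fields are filled in through the administration UI and stored in ElasticSearch rather than constructed in code.
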
17 | Once configuration is done, the Crawling Framework runs a [Storm Crawler](http://stormcrawler.net/) based crawl which follows the rules specified in the configuration.
18 |
19 | ## Introduction
20 |
21 | We have recorded a video on how to set up and use the Crawling Framework; follow the link below to watch it on YouTube.
22 |
23 | [Watch the Crawling Framework introduction on YouTube](https://www.youtube.com/watch?v=AvO4lmmIuis)
24 |
25 | ## Requirements
26 |
27 | The framework writes its configuration and stores crawled data in ElasticSearch. Before starting a crawl project, [install ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html) (Crawling Framework is tested to work with Elastic v7.x).
28 |
29 | Crawling Framework is a Java library which has to be extended to run a Storm Crawler topology, so a Java (JDK 8, Maven) build infrastructure is needed.
30 |
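For instance, `Dockerfile.crawler` starts the topology by invoking `lt.tokenmill.crawling.crawler.CrawlerTopology` with `-local -conf docker-compose.yaml`; an extending project can do the equivalent from its own entry point. A minimal sketch, assuming the same command-line arguments:

```java
import lt.tokenmill.crawling.crawler.CrawlerTopology;

public class MyCrawler {

    public static void main(String[] args) throws Exception {
        // Run the Storm topology in local mode with the bundled sample config,
        // mirroring the CMD in Dockerfile.crawler.
        CrawlerTopology.main(new String[]{"-local", "-conf", "crawler/conf/local.yaml"});
    }
}
```
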
31 | ### Using password protected ElasticSearch
32 |
33 | Some providers hide ElasticSearch behind an authentication step (which makes sense). Just set the environment variables `ES_USERNAME` and `ES_PASSWORD` accordingly; everything else can remain the same. Authentication is performed implicitly when proper credentials are present.
34 |
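A quick way to check that the credentials are actually visible to the JVM before starting a UI or the crawler (a throwaway sketch; the framework reads these variables itself, and this class is not part of the codebase):

```java
public class EsCredentialsCheck {

    public static void main(String[] args) {
        // Authentication is performed implicitly when both variables are set.
        for (String name : new String[]{"ES_USERNAME", "ES_PASSWORD"}) {
            System.out.println(name + (System.getenv(name) == null ? " is NOT set" : " is set"));
        }
    }
}
```
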
35 | ## Configuring and Running a crawl
36 |
37 | See [Crawling Framework Example](https://github.com/tokenmill/crawling-framework-example) project's documentation.
38 |
39 |
40 | ## License
41 |
42 | Copyright © 2017-2019 [TokenMill UAB](http://www.tokenmill.ai).
43 |
44 | Distributed under the Apache License, Version 2.0.
45 |
--------------------------------------------------------------------------------
/administration-ui/conf/development.properties:
--------------------------------------------------------------------------------
1 | port=8081
2 | es.hostname=localhost
3 | es.transport.port=9300
4 | es.httpsource.index.name=http_sources
5 | es.httpsource.doc.type=http_source
6 | es.httpsourcetest.index.name=http_source_tests
7 | es.httpsourcetest.doc.type=http_source_test
8 | es.namedqueries.index.name=named_queries
9 | es.namedqueries.doc.type=named_query
10 | es.docs.index.name=docs
11 | es.docs.doc.type=doc
12 | es.urls.index.name=urls
13 | es.urls.doc.type=url
--------------------------------------------------------------------------------
/administration-ui/conf/docker-compose.properties:
--------------------------------------------------------------------------------
1 | port=8081
2 | es.hostname=elasticsearch
3 | es.transport.port=9300
4 | es.httpsource.index.name=http_sources
5 | es.httpsource.doc.type=http_source
6 | es.httpsourcetest.index.name=http_source_tests
7 | es.httpsourcetest.doc.type=http_source_test
8 | es.namedqueries.index.name=named_queries
9 | es.namedqueries.doc.type=named_query
10 | es.docs.index.name=docs
11 | es.docs.doc.type=doc
12 | es.urls.index.name=urls
13 | es.urls.doc.type=url
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/Application.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui;
2 |
3 | import com.vaadin.server.VaadinServlet;
4 | import lt.tokenmill.crawling.commonui.Configuration;
5 | import org.eclipse.jetty.security.*;
6 | import org.eclipse.jetty.security.authentication.BasicAuthenticator;
7 | import org.eclipse.jetty.server.Server;
8 | import org.eclipse.jetty.servlet.ServletContextHandler;
9 | import org.eclipse.jetty.servlet.ServletHolder;
10 | import org.eclipse.jetty.util.security.Constraint;
11 | import org.eclipse.jetty.util.security.Credential;
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 |
16 | public class Application {
17 |
18 | private static final Logger LOG = LoggerFactory.getLogger(Application.class);
19 | private static final Boolean PRODUCTION_MODE = true;
20 |
21 | private static SecurityHandler basicAuth(String username, String password, String realm) {
22 |
23 | HashLoginService l = new HashLoginService();
24 | l.putUser(username, Credential.getCredential(password), new String[]{"editor"});
25 | l.setName(realm);
26 |
27 | Constraint constraint = new Constraint();
28 | constraint.setName(Constraint.__BASIC_AUTH);
29 | constraint.setRoles(new String[]{"editor"});
30 | constraint.setAuthenticate(true);
31 |
32 | ConstraintMapping cm = new ConstraintMapping();
33 | cm.setConstraint(constraint);
34 | cm.setPathSpec("/*");
35 |
36 | ConstraintSecurityHandler csh = new ConstraintSecurityHandler();
37 | csh.setAuthenticator(new BasicAuthenticator());
38 | csh.setRealmName("cf");
39 | csh.addConstraintMapping(cm);
40 | csh.setLoginService(l);
41 |
42 | return csh;
43 |
44 | }
45 |
46 | public static void main(String[] args) {
47 | int port = Configuration.INSTANCE.getInt("port", 8080);
48 | Server server = new Server(port);
49 | ServletContextHandler contextHandler
50 | = new ServletContextHandler(ServletContextHandler.SESSIONS);
51 |
52 | boolean authEnabled = Boolean.parseBoolean(Configuration.INSTANCE.getString("basicAuth", "false"));
53 |
54 | if(authEnabled) {
55 | contextHandler.setSecurityHandler(basicAuth(System.getenv("UI_USER"), System.getenv("UI_PASSWORD"), "editor"));
56 | }
57 | contextHandler.setContextPath("/");
58 | ServletHolder sh = new ServletHolder(new VaadinServlet());
59 | contextHandler.addServlet(sh, "/*");
60 | contextHandler.setInitParameter("ui", CrawlerAdminUI.class.getCanonicalName());
61 | contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE));
62 | server.setHandler(contextHandler);
63 |
64 |
65 | try {
66 | server.start();
67 | server.join();
68 | } catch (Exception e) {
69 | LOG.error("Failed to start application", e);
70 | }
71 | }
72 | }
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/CrawlerAdminUI.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui;
2 |
3 | import com.vaadin.annotations.Theme;
4 | import com.vaadin.annotations.VaadinServletConfiguration;
5 | import com.vaadin.server.VaadinRequest;
6 | import com.vaadin.server.VaadinServlet;
7 | import com.vaadin.ui.UI;
8 | import lt.tokenmill.crawling.adminui.view.HttpSourcesView;
9 |
10 | import javax.servlet.annotation.WebServlet;
11 |
12 | @Theme("crawleradmintheme")
13 | public class CrawlerAdminUI extends UI {
14 |
15 | @Override
16 | protected void init(VaadinRequest vaadinRequest) {
17 | setContent(new HttpSourcesView());
18 | }
19 |
20 | @WebServlet(urlPatterns = "/*", name = "CrawlerAdminUIServlet", asyncSupported = true)
21 | @VaadinServletConfiguration(ui = CrawlerAdminUI.class, productionMode = false)
22 | public static class CrawlerAdminUIServlet extends VaadinServlet {
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/HttpSourceTestsCache.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui;
2 |
3 | import com.google.common.base.Strings;
4 | import com.google.common.cache.Cache;
5 | import com.google.common.cache.CacheBuilder;
6 |
7 | import java.util.concurrent.TimeUnit;
8 |
9 | public class HttpSourceTestsCache {
10 |
11 | private static final Cache<String, HttpSourceTest> CACHE = CacheBuilder
12 | .newBuilder()
13 | .maximumSize(1000)
14 | .expireAfterWrite(5, TimeUnit.DAYS)
15 | .build();
16 |
17 | public static HttpSourceTest get(String sourceUrl) {
18 | HttpSourceTest test = CACHE.getIfPresent(sourceUrl.toLowerCase());
19 | return test != null ? test : new HttpSourceTest("", "");
20 | }
21 |
22 | public static void put(String sourceUrl, String url, String html) {
23 | CACHE.put(sourceUrl.toLowerCase(),
24 | new HttpSourceTest(Strings.nullToEmpty(url), Strings.nullToEmpty(html)));
25 | }
26 |
27 | public static class HttpSourceTest {
28 |
29 | private String url;
30 | private String html;
31 |
32 | public HttpSourceTest(String url, String html) {
33 | this.url = url;
34 | this.html = html;
35 | }
36 |
37 | public String getUrl() {
38 | return url;
39 | }
40 |
41 | public void setUrl(String url) {
42 | this.url = url;
43 | }
44 |
45 | public String getHtml() {
46 | return html;
47 | }
48 |
49 | public void setHtml(String html) {
50 | this.html = html;
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/CSVUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import com.google.common.collect.Maps;
4 | import com.opencsv.CSVReader;
5 | import com.opencsv.CSVWriter;
6 |
7 | import java.io.Reader;
8 | import java.io.StringReader;
9 | import java.io.Writer;
10 | import java.util.Map;
11 |
12 | public class CSVUtils {
13 |
14 | private static final char DEFAULT_SEPARATOR = ',';
15 | private static final char DEFAULT_QUOTE = '\"';
16 | private static final char DEFAULT_ESCAPE = '\\';
17 |
18 | public static CSVWriter createDefaultWriter(Writer writer) {
19 | return new CSVWriter(writer, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
20 | }
21 |
22 | public static CSVReader createDefaultReader(Reader reader) {
23 | return new CSVReader(reader, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
24 | }
25 |
26 | public static CSVReader createDefaultReader(String csv) {
27 | return createDefaultReader(new StringReader(csv));
28 | }
29 |
30 | public static Map<String, Integer> resolveColumnIndexes(String[] columns, String[] headers) {
31 | Map<String, Integer> result = Maps.newHashMap();
32 | for (String column : columns) {
33 | for (int i = 0; i < headers.length; i++) {
34 | if (headers[i].equalsIgnoreCase(column)) {
35 | result.put(column, i);
36 | }
37 | }
38 | }
39 | return result;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/GridUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import com.google.common.base.Joiner;
4 | import com.vaadin.data.Item;
5 | import com.vaadin.data.util.PropertyValueGenerator;
6 | import com.vaadin.data.util.converter.Converter;
7 |
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.Locale;
11 |
12 | public class GridUtils {
13 |
14 | public static class StringListConverter implements Converter<String, List> {
15 | @Override
16 | public List convertToModel(String s, Class<? extends List> aClass, Locale locale) throws ConversionException {
17 | return new ArrayList();
18 | }
19 |
20 | @Override
21 | public String convertToPresentation(List list, Class<? extends String> aClass, Locale locale) throws ConversionException {
22 | return Joiner.on(", ").join(list);
23 | }
24 |
25 | @Override
26 | public Class<List> getModelType() {
27 | return List.class;
28 | }
29 |
30 | @Override
31 | public Class<String> getPresentationType() {
32 | return String.class;
33 | }
34 | }
35 |
36 | public static class UrlToLinkConverter implements Converter<String, String> {
37 |
38 | @Override
39 | public String convertToModel(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
40 | return string;
41 | }
42 |
43 | @Override
44 | public String convertToPresentation(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
45 | return String.format("<a href=\"%s\" target=\"_blank\">%s</a>", string, string);
46 | }
47 |
48 | @Override
49 | public Class<String> getModelType() {
50 | return String.class;
51 | }
52 |
53 | @Override
54 | public Class<String> getPresentationType() {
55 | return String.class;
56 | }
57 | }
58 |
59 | public static class ButtonPropertyGenerator extends PropertyValueGenerator<String> {
60 |
61 |
62 | private String name;
63 |
64 | public ButtonPropertyGenerator(String name) {
65 | this.name = name;
66 | }
67 |
68 | @Override
69 | public String getValue(Item item, Object itemId, Object propertyId) {
70 | return name;
71 | }
72 |
73 | @Override
74 | public Class<String> getType() {
75 | return String.class;
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceCSVUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import com.google.common.base.Strings;
4 | import lt.tokenmill.crawling.data.DataUtils;
5 | import lt.tokenmill.crawling.data.HttpSource;
6 | import lt.tokenmill.crawling.es.Utils;
7 |
8 | import java.util.Map;
9 | import java.util.Objects;
10 |
11 | public class HttpSourceCSVUtils {
12 |
13 | public static final String[] CSV_COLUMNS = new String[]{
14 | "url", "name", "language", "timezone", "enabled",
15 | "discovery_enabled", "url_crawl_delay_secs", "feed_crawl_delay_secs",
16 | "sitemap_crawl_delay_secs", "urls", "feeds", "sitemaps",
17 | "categories", "app_ids",
18 | "url_filters", "url_normalizers", "title_selectors",
19 | "text_selectors", "text_normalizers",
20 | "date_selectors", "date_regexps", "date_formats"};
21 |
22 | public static String[] mapHttpSourceToCsvRow(HttpSource ld) {
23 | return new String[]{
24 | ld.getUrl(), ld.getName(), ld.getLanguage(), ld.getTimezone(),
25 | String.valueOf(ld.isEnabled()), String.valueOf(ld.isDiscoveryEnabled()),
26 | Objects.toString(ld.getUrlRecrawlDelayInSecs(), ""),
27 | Objects.toString(ld.getFeedRecrawlDelayInSecs(), ""),
28 | Objects.toString(ld.getSitemapRecrawlDelayInSecs(), ""),
29 | Utils.listToText(ld.getUrls()), Utils.listToText(ld.getFeeds()), Utils.listToText(ld.getSitemaps()),
30 | Utils.listToText(ld.getCategories()), Utils.listToText(ld.getAppIds()),
31 | Utils.listToText(ld.getUrlFilters()), Utils.listToText(ld.getUrlNormalizers()),
32 | Utils.listToText(ld.getTitleSelectors()),
33 | Utils.listToText(ld.getTextSelectors()), Utils.listToText(ld.getTextNormalizers()),
34 | Utils.listToText(ld.getDateSelectors()), Utils.listToText(ld.getDateRegexps()),
35 | Utils.listToText(ld.getDateFormats())
36 | };
37 | }
38 |
39 | public static HttpSource mapCsvRowToHttpSource(String[] row, Map<String, Integer> columnIndexes) {
40 | HttpSource hs = new HttpSource();
41 | hs.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
42 | hs.setName(Strings.emptyToNull(row[columnIndexes.get("name")]));
43 | hs.setLanguage(Strings.emptyToNull(row[columnIndexes.get("language")]));
44 | hs.setTimezone(Strings.emptyToNull(row[columnIndexes.get("timezone")]));
45 | hs.setEnabled(Boolean.parseBoolean(row[columnIndexes.get("enabled")]));
46 | hs.setDiscoveryEnabled(Boolean.parseBoolean(row[columnIndexes.get("discovery_enabled")]));
47 | hs.setUrlRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("url_crawl_delay_secs")]));
48 | hs.setFeedRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("feed_crawl_delay_secs")]));
49 | hs.setSitemapRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("sitemap_crawl_delay_secs")]));
50 | hs.setUrls(DataUtils.parseStringList(row[columnIndexes.get("urls")]));
51 | hs.setFeeds(DataUtils.parseStringList(row[columnIndexes.get("feeds")]));
52 | hs.setSitemaps(DataUtils.parseStringList(row[columnIndexes.get("sitemaps")]));
53 | hs.setCategories(DataUtils.parseStringList(row[columnIndexes.get("categories")]));
54 | hs.setAppIds(DataUtils.parseStringList(row[columnIndexes.get("app_ids")]));
55 | hs.setUrlFilters(DataUtils.parseStringList(row[columnIndexes.get("url_filters")]));
56 | hs.setUrlNormalizers(DataUtils.parseStringList(row[columnIndexes.get("url_normalizers")]));
57 | hs.setTitleSelectors(DataUtils.parseStringList(row[columnIndexes.get("title_selectors")]));
58 | hs.setTextSelectors(DataUtils.parseStringList(row[columnIndexes.get("text_selectors")]));
59 | hs.setTextNormalizers(DataUtils.parseStringList(row[columnIndexes.get("text_normalizers")]));
60 | hs.setDateSelectors(DataUtils.parseStringList(row[columnIndexes.get("date_selectors")]));
61 | hs.setDateRegexps(DataUtils.parseStringList(row[columnIndexes.get("date_regexps")]));
62 | hs.setDateFormats(DataUtils.parseStringList(row[columnIndexes.get("date_formats")]));
63 | return hs;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import com.google.common.base.Charsets;
4 | import com.google.common.base.Strings;
5 | import com.google.common.io.BaseEncoding;
6 | import lt.tokenmill.crawling.data.HttpSourceTest;
7 |
8 | import java.util.Map;
9 | import java.util.Objects;
10 |
11 | public class HttpSourceTestCSVUtils {
12 |
13 | public static final String[] CSV_COLUMNS = new String[]{
14 | "url", "source", "html", "url_accepted", "title", "text", "date"};
15 |
16 | public static String[] mapHttpSourceTestToCsvRow(HttpSourceTest httpSourceTest) {
17 | return new String[]{
18 | httpSourceTest.getUrl(), httpSourceTest.getSource(),
19 | BaseEncoding.base64().encode(httpSourceTest.getHtml().getBytes(Charsets.UTF_8)),
20 | Objects.toString(httpSourceTest.getUrlAccepted(), "false"),
21 | Strings.nullToEmpty(httpSourceTest.getTitle()),
22 | Strings.nullToEmpty(httpSourceTest.getText()),
23 | Strings.nullToEmpty(httpSourceTest.getDate())
24 | };
25 | }
26 |
27 | public static HttpSourceTest mapCsvRowToHttpSourceTest(String[] row, Map<String, Integer> columnIndexes) {
28 | HttpSourceTest hst = new HttpSourceTest();
29 | hst.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
30 | hst.setSource(Strings.emptyToNull(row[columnIndexes.get("source")]));
31 | hst.setHtml(new String(BaseEncoding.base64().decode(row[columnIndexes.get("html")]), Charsets.UTF_8));
32 | hst.setUrlAccepted(Boolean.parseBoolean(row[columnIndexes.get("url_accepted")]));
33 | hst.setTitle(Strings.emptyToNull(row[columnIndexes.get("title")]));
34 | hst.setText(Strings.emptyToNull(row[columnIndexes.get("text")]));
35 | hst.setDate(Strings.emptyToNull(row[columnIndexes.get("date")]));
36 | return hst;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/BaseView.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view;
2 |
3 |
4 | import com.vaadin.ui.HorizontalLayout;
5 | import com.vaadin.ui.MenuBar;
6 | import com.vaadin.ui.UI;
7 | import com.vaadin.ui.VerticalLayout;
8 | import lt.tokenmill.crawling.adminui.view.namedquery.NamedQueriesView;
9 | import lt.tokenmill.crawling.adminui.view.pageanalysis.PageAnalysisView;
10 | import lt.tokenmill.crawling.adminui.view.sourcetest.HttpSourceTestsView;
11 |
12 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
13 |
14 | public class BaseView extends VerticalLayout {
15 |
16 | public BaseView(String title) {
17 | UI.getCurrent().getPage().setTitle(String.format("Crawler Admin | %s", title));
18 | setWidth(100, PERCENTAGE);
19 | setSpacing(true);
20 | setMargin(true);
21 |
22 | HorizontalLayout actionBarLayout = new HorizontalLayout();
23 | actionBarLayout.setWidth(100, PERCENTAGE);
24 |
25 | MenuBar menu = new MenuBar();
26 |
27 | MenuBar.MenuItem dataItem = menu.addItem("Configuration", null);
28 | dataItem.addItem("HTTP Sources", (item) -> UI.getCurrent().setContent(new HttpSourcesView()));
29 | dataItem.addItem("HTTP Source Tests", (item) -> UI.getCurrent().setContent(new HttpSourceTestsView()));
30 | dataItem.addItem("Named Queries", (item) -> UI.getCurrent().setContent(new NamedQueriesView()));
31 | dataItem.addItem("Import / Export", (item) -> UI.getCurrent().setContent(new ImportExportView()));
32 |
33 | menu.addItem("Page Analysis", (item) -> UI.getCurrent().setContent(new PageAnalysisView()));
34 |
35 | actionBarLayout.addComponent(menu);
36 |
37 | addComponent(actionBarLayout);
38 | }
39 |
40 | }
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/HttpSourceStatsWindow.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view;
2 |
3 |
4 | import com.byteowls.vaadin.chartjs.ChartJs;
5 | import com.byteowls.vaadin.chartjs.config.BarChartConfig;
6 | import com.byteowls.vaadin.chartjs.data.BarDataset;
7 | import com.byteowls.vaadin.chartjs.data.Dataset;
8 | import com.byteowls.vaadin.chartjs.data.LineDataset;
9 | import com.byteowls.vaadin.chartjs.options.Position;
10 | import com.vaadin.ui.Component;
11 | import com.vaadin.ui.Window;
12 | import lt.tokenmill.crawling.commonui.ElasticSearch;
13 | import lt.tokenmill.crawling.es.model.DateHistogramValue;
14 |
15 | import java.util.List;
16 | import java.util.stream.Collectors;
17 |
18 | public class HttpSourceStatsWindow extends Window {
19 |
20 | public HttpSourceStatsWindow(String sourceUrl) {
21 | setModal(true);
22 | center();
23 | setCaption(String.format("%s crawling statistics", sourceUrl));
24 | setWidth(50, Unit.PERCENTAGE);
25 | setHeight(50, Unit.PERCENTAGE);
26 | List<DateHistogramValue> urls = ElasticSearch.getUrlOperations().calculateStats(sourceUrl);
27 | List<DateHistogramValue> documents = ElasticSearch.getDocumentOperations().calculateStats(sourceUrl);
28 | Component layout = getChart(sourceUrl, urls, documents);
29 | layout.setWidth(100, Unit.PERCENTAGE);
30 | setContent(layout);
31 | }
32 |
33 | public Component getChart(String sourceUrl, List<DateHistogramValue> urls, List<DateHistogramValue> documents) {
34 | BarChartConfig config = new BarChartConfig();
35 |
36 | BarDataset docsDataset = new BarDataset().type().label("Fetched Documents")
37 | .borderColor("rgb(54, 162, 235)")
38 | .backgroundColor("rgb(54, 162, 235)")
39 | .borderWidth(2);
40 | documents.forEach(d -> docsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));
41 |
42 | LineDataset urlsDataset = new LineDataset().type().label("Discovered Urls")
43 | .borderColor("rgb(75, 192, 192)")
44 | .backgroundColor("white")
45 | .borderWidth(2);
46 | urls.forEach(d -> urlsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));
47 |
48 | config.data()
49 | .labelsAsList(urls.stream().map(DateHistogramValue::getDate).collect(Collectors.toList()))
50 | .addDataset(docsDataset)
51 | .addDataset(urlsDataset)
52 | .and();
53 |
54 | config.options()
55 | .responsive(true)
56 | .title()
57 | .display(true)
58 | .position(Position.LEFT)
59 | .and()
60 | .done();
61 |
62 | ChartJs chart = new ChartJs(config);
63 | chart.setJsLoggingEnabled(true);
64 | return chart;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/ImportExportView.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view;
2 |
3 | import com.vaadin.ui.TabSheet;
4 | import lt.tokenmill.crawling.adminui.view.imports.HttpSourceImportExport;
5 | import lt.tokenmill.crawling.adminui.view.imports.HttpSourceTestImportExport;
6 | import lt.tokenmill.crawling.adminui.view.imports.NamedQueryImportExport;
7 |
8 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
9 |
10 | public class ImportExportView extends BaseView {
11 |
12 | public ImportExportView() {
13 | super("Import / Export");
14 | TabSheet mainLayout = new TabSheet();
15 | mainLayout.setWidth(100, PERCENTAGE);
16 | mainLayout.addTab(new HttpSourceImportExport(), "HTTP Sources");
17 | mainLayout.addTab(new HttpSourceTestImportExport(), "HTTP Source Tests");
18 | mainLayout.addTab(new NamedQueryImportExport(), "Named Queries");
19 | addComponent(mainLayout);
20 | }
21 |
22 |
23 |
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueriesView.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view.namedquery;
2 |
3 | import com.vaadin.data.util.BeanItemContainer;
4 | import com.vaadin.data.util.GeneratedPropertyContainer;
5 | import com.vaadin.ui.*;
6 | import lt.tokenmill.crawling.adminui.view.BaseView;
7 | import lt.tokenmill.crawling.commonui.ElasticSearch;
8 | import lt.tokenmill.crawling.data.NamedQuery;
9 | import lt.tokenmill.crawling.data.PageableList;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
14 | import static com.vaadin.server.Sizeable.Unit.PIXELS;
15 |
16 | public class NamedQueriesView extends BaseView {
17 |
18 | private static final Logger LOG = LoggerFactory.getLogger(NamedQueriesView.class);
19 |
20 | private Grid itemsGrid = new Grid(new GeneratedPropertyContainer(new BeanItemContainer<>(NamedQuery.class)));
21 | private Label totalCountLabel = new Label();
22 | private TextField filterField = new TextField();
23 |
24 | public NamedQueriesView() {
25 | super("Named Queries");
26 | HorizontalLayout mainLayout = new HorizontalLayout();
27 | mainLayout.setWidth(100, PERCENTAGE);
28 | mainLayout.setHeight(100, PERCENTAGE);
29 | mainLayout.setSpacing(true);
30 |
31 | VerticalLayout gridLayout = new VerticalLayout();
32 | gridLayout.setSpacing(true);
33 | gridLayout.setWidth(100, PERCENTAGE);
34 |
35 |
36 | // Search field and create new button
37 | filterField.setInputPrompt("Enter Name...");
38 | filterField.addTextChangeListener(event -> refreshGrid(event.getText()));
39 |
40 | Button addNewButton = new Button("Add New Query");
41 | addNewButton.addClickListener(event -> showNamedQueryForm(new NamedQuery()));
42 |
43 | HorizontalLayout actionHeader = new HorizontalLayout(filterField, addNewButton);
44 | actionHeader.setSpacing(true);
45 | actionHeader.setWidth(100, PERCENTAGE);
46 | filterField.setWidth(100, PERCENTAGE);
47 | actionHeader.setExpandRatio(filterField, 1.0f);
48 | gridLayout.addComponent(actionHeader);
49 |
50 | // Grid
51 | itemsGrid.setWidth(100, PERCENTAGE);
52 | itemsGrid.setHeight(700, PIXELS);
53 | itemsGrid.setSelectionMode(Grid.SelectionMode.SINGLE);
54 | itemsGrid.addSelectionListener(
55 | e -> {
56 | NamedQuery nq = (NamedQuery) itemsGrid.getSelectedRow();
57 | if (nq != null) {
58 | nq = ElasticSearch.getNamedQueryOperations().get(nq.getName());
59 | showNamedQueryForm(nq);
60 | }
61 | });
62 | itemsGrid.setColumns("name");
63 | gridLayout.addComponent(itemsGrid);
64 | gridLayout.addComponent(totalCountLabel);
65 | refreshGrid(filterField.getValue());
66 | mainLayout.addComponent(gridLayout);
67 | mainLayout.setExpandRatio(gridLayout, 1f);
68 | addComponent(mainLayout);
69 | }
70 |
71 | private void refreshGrid(String text) {
72 | PageableList<NamedQuery> data = ElasticSearch.getNamedQueryOperations().filter(text);
73 | itemsGrid.getContainerDataSource().removeAllItems();
74 | for (NamedQuery nq : data.getItems()) {
75 | itemsGrid.getContainerDataSource().addItem(nq);
76 | }
77 | totalCountLabel.setValue(String.format("Total count: %d", data.getTotalCount()));
78 | LOG.info("Refreshed grid using filter '{}'. Total items: {}", text, data.getTotalCount());
79 | }
80 |
81 | private void showNamedQueryForm(NamedQuery nq) {
82 | NamedQueryFormWindow formWindow = new NamedQueryFormWindow(nq);
83 | formWindow.addAfterUpdateListener(() -> refreshGrid(filterField.getValue()));
84 | UI.getCurrent().addWindow(formWindow);
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueryResultsPanel.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view.namedquery;
2 |
3 | import com.vaadin.shared.ui.label.ContentMode;
4 | import com.vaadin.ui.Label;
5 | import com.vaadin.ui.Panel;
6 | import com.vaadin.ui.VerticalLayout;
7 | import com.vaadin.ui.themes.ValoTheme;
8 | import lt.tokenmill.crawling.data.DataUtils;
9 | import lt.tokenmill.crawling.data.HttpArticle;
10 | import lt.tokenmill.crawling.data.PageableList;
11 |
12 | public class NamedQueryResultsPanel extends Panel {
13 |
14 | public NamedQueryResultsPanel(PageableList<HttpArticle> results) {
15 | VerticalLayout layout = new VerticalLayout();
16 | layout.setMargin(true);
17 |
18 | Label countLabel = new Label(String.format("%s documents matched", results.getTotalCount()));
19 | countLabel.addStyleName(ValoTheme.LABEL_LARGE);
20 | countLabel.setSizeFull();
21 | layout.addComponent(countLabel);
22 |
23 | for (HttpArticle article : results.getItems()) {
24 | String labelHtml = String.format("%s <a href=\"%s\" target=\"_blank\">%s</a> - %s",
25 | DataUtils.formatInUTC(article.getPublished()), article.getUrl(), article.getTitle(), article.getSource());
26 | Label articleLabel = new Label(labelHtml);
27 | articleLabel.setContentMode(ContentMode.HTML);
28 | articleLabel.setSizeFull();
29 | layout.addComponent(articleLabel);
30 | }
31 | setContent(layout);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/HttpSourceAllTestsWindow.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view.sourcetest;
2 |
3 | import com.google.common.collect.Lists;
4 | import com.vaadin.ui.*;
5 | import com.vaadin.ui.themes.ValoTheme;
6 | import lt.tokenmill.crawling.commonui.ElasticSearch;
7 | import lt.tokenmill.crawling.data.HttpSource;
8 | import lt.tokenmill.crawling.data.HttpSourceTest;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.util.List;
13 |
14 | public class HttpSourceAllTestsWindow extends Window {
15 |
16 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceAllTestsWindow.class);
17 |
18 | private List afterUpdateListeners = Lists.newArrayList();
19 |
20 | private Button cancelButton = new Button("Close", (event) -> this.close());
21 |
22 | public HttpSourceAllTestsWindow() {
23 | setCaption("All Tests");
24 | setModal(true);
25 | center();
26 | setWidth(80, Unit.PERCENTAGE);
27 | setHeight(80, Unit.PERCENTAGE);
28 |
29 | VerticalLayout mainLayout = new VerticalLayout();
30 | mainLayout.setMargin(true);
31 |
32 | List<HttpSourceTest> tests = ElasticSearch.getHttpSourceTestOperations().all();
33 | for (HttpSourceTest test : tests) {
34 | HttpSource source = ElasticSearch.getHttpSourceOperations().get(test.getSource());
35 | if (source == null) {
36 | Label noSourceLabel = new Label(String.format("Source configuration '%s' not found", test.getSource()));
37 | noSourceLabel.addStyleName(ValoTheme.LABEL_FAILURE);
38 | noSourceLabel.setSizeFull();
39 | mainLayout.addComponent(noSourceLabel);
40 | } else {
41 | mainLayout.addComponent(new TestResultsPanel(source, test));
42 | }
43 | }
44 |
45 | HorizontalLayout actions = new HorizontalLayout(cancelButton);
46 | actions.setSpacing(true);
47 |
48 | setContent(mainLayout);
49 | }
50 | }
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/TestResultsPanel.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.view.sourcetest;
2 |
3 | import com.vaadin.ui.*;
4 | import com.vaadin.ui.themes.ValoTheme;
5 | import lt.tokenmill.crawling.data.HttpSource;
6 | import lt.tokenmill.crawling.data.HttpSourceTest;
7 | import lt.tokenmill.crawling.parser.utils.HttpSourceTester;
8 |
9 | import java.util.Map;
10 |
11 | public class TestResultsPanel extends Panel {
12 |
13 | private Map<String, HttpSourceTester.Difference> difference;
14 |
15 | public TestResultsPanel(HttpSource source, HttpSourceTest test) {
16 | this.difference = HttpSourceTester.test(source, test);
17 | VerticalLayout layout = new VerticalLayout();
18 | layout.setMargin(true);
19 | if (this.difference.isEmpty()) {
20 | Label resultLabel = new Label(String.format("'%s' Test Passed", test.getUrl()));
21 | resultLabel.addStyleName(ValoTheme.LABEL_SUCCESS);
22 | resultLabel.setSizeFull();
23 | layout.addComponent(resultLabel);
24 | } else {
25 | Label resultLabel = new Label(String.format("'%s' Test Failed", test.getUrl()));
26 | resultLabel.addStyleName(ValoTheme.LABEL_FAILURE);
27 | resultLabel.setSizeFull();
28 | layout.addComponent(resultLabel);
29 | }
30 |
31 | for (Map.Entry<String, HttpSourceTester.Difference> diff : difference.entrySet()) {
32 | HorizontalLayout fieldLayout = new HorizontalLayout();
33 | fieldLayout.setSizeFull();
34 |
35 | Label resultLabel = new Label(diff.getKey());
36 | resultLabel.addStyleName(ValoTheme.LABEL_LARGE);
37 | fieldLayout.addComponent(resultLabel);
38 | fieldLayout.setComponentAlignment(resultLabel, Alignment.MIDDLE_CENTER);
39 | fieldLayout.setExpandRatio(resultLabel, 0.15f);
40 |
41 | FormLayout valuesLayout = new FormLayout();
42 | valuesLayout.setWidth(100, Unit.PERCENTAGE);
43 | valuesLayout.setSizeFull();
44 |
45 | TextArea expected = new TextArea("Expected");
46 | expected.setSizeFull();
47 | expected.setRows(2);
48 | expected.setValue(diff.getValue().getExpected());
49 | expected.setReadOnly(true);
50 |
51 | TextArea actual = new TextArea("Actual");
52 | actual.setSizeFull();
53 | actual.setRows(2);
54 | actual.setValue(diff.getValue().getActual());
55 | actual.setReadOnly(true);
56 |
57 | valuesLayout.addComponents(expected, actual);
58 |
59 | fieldLayout.addComponent(valuesLayout);
60 | fieldLayout.setExpandRatio(valuesLayout, 0.85f);
61 |
62 | layout.addComponent(fieldLayout);
63 | }
64 |
65 | setContent(layout);
66 | }
67 |
68 | public boolean passed() {
69 | return difference != null && difference.isEmpty();
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=DEBUG, stdout
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Target=System.out
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n
7 |
8 |
9 | log4j.logger.org.apache=INFO
10 | log4j.logger.org.eclipse.jetty=INFO
11 | log4j.logger.org.elasticsearch=INFO
--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/crawling-framework/987100fee5965b43e178c9096ab3b2aa3a11fac7/administration-ui/src/main/resources/log4j2.properties
--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/addons.scss:
--------------------------------------------------------------------------------
1 | /* This file is automatically managed and will be overwritten from time to time. */
2 | /* Do not manually edit this file. */
3 |
4 | /* Import and include this mixin into your project theme to include the addon themes */
5 | @mixin addons {
6 | }
7 |
8 |
--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/crawleradmintheme.scss:
--------------------------------------------------------------------------------
1 | // If you edit this file you need to compile the theme. See README.md for details.
2 | // Global variable overrides. Must be declared before importing Valo.
3 | // Defines the plaintext font size, weight and family. Font size affects general component sizing.
4 |
5 | //$v-font-size: 16px;
6 | //$v-font-weight: 300;
7 | //$v-font-family: "Open Sans", sans-serif;
8 |
9 | // Defines the border used by all components.
10 | //$v-border: 1px solid (v-shade 0.7);
11 | //$v-border-radius: 4px;
12 |
13 | // Affects the color of some component elements, e.g Button, Panel title, etc
14 | //$v-background-color: hsl(210, 0%, 98%);
15 |
16 | // Affects the color of content areas, e.g Panel and Window content, TextField input etc
17 | //$v-app-background-color: $v-background-color;
18 |
19 | // Affects the visual appearance of all components
20 | //$v-gradient: v-linear 8%;
21 | //$v-bevel-depth: 30%;
22 | //$v-shadow-opacity: 5%;
23 |
24 | // Defines colors for indicating status (focus, success, failure)
25 | //$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically
26 | //$v-friendly-color: #2c9720;
27 | //$v-error-indicator-color: #ed473b;
28 |
29 | // For more information, see: https://vaadin.com/book/-/page/themes.valo.html
30 | // Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples
31 |
32 | @import "../valo/valo.scss";
33 |
34 | @mixin crawleradmintheme {
35 | @include valo;
36 |
37 | // Insert your own theme rules here
38 | }
--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/styles.scss:
--------------------------------------------------------------------------------
1 | @import "crawleradmintheme.scss";
2 | @import "addons.scss";
3 |
4 | // This file prefixes all rules with the theme name to avoid causing conflicts with other themes.
5 | // The actual styles should be defined in crawleradmintheme.scss
6 |
7 | .crawleradmintheme {
8 | @include addons;
9 | @include crawleradmintheme;
10 | }
--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtilsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import com.google.common.base.Charsets;
4 | import com.google.common.io.Resources;
5 | import lt.tokenmill.crawling.data.HttpSourceTest;
6 | import org.junit.Test;
7 |
8 | import java.net.URL;
9 | import java.time.Instant;
10 | import java.util.Map;
11 |
12 | import static lt.tokenmill.crawling.adminui.utils.HttpSourceTestCSVUtils.CSV_COLUMNS;
13 | import static org.junit.Assert.assertEquals;
14 |
15 | public class HttpSourceTestCSVUtilsTest {
16 |
17 | protected String loadHtml(String name) throws Exception {
18 | URL htmlResource = Resources.getResource(name + ".html");
19 | return Resources.toString(htmlResource, Charsets.UTF_8);
20 | }
21 |
22 | @Test
23 | public void testHttpSourceTestToCsvAndBack() throws Exception {
24 | HttpSourceTest httpSourceTest = new HttpSourceTest();
25 | httpSourceTest.setUrl("http://www.tokenmill.lt/");
26 | httpSourceTest.setSource("http://www.tokenmill.lt/");
27 | httpSourceTest.setHtml(loadHtml("www.tokenmill.lt"));
28 | httpSourceTest.setUrlAccepted(true);
29 | httpSourceTest.setTitle("TokenMill");
30 | httpSourceTest.setText("Some text");
31 | httpSourceTest.setDate(Instant.now().toString());
32 |
33 | String[] csvRow = HttpSourceTestCSVUtils.mapHttpSourceTestToCsvRow(httpSourceTest);
34 | String[] headerLine = CSV_COLUMNS;
35 | Map<String, Integer> columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS);
36 | HttpSourceTest fromRow = HttpSourceTestCSVUtils.mapCsvRowToHttpSourceTest(csvRow, columnIndexes);
37 | assertEquals(httpSourceTest.getUrl(), fromRow.getUrl());
38 | assertEquals(httpSourceTest.getSource(), fromRow.getSource());
39 | assertEquals(httpSourceTest.getHtml(), fromRow.getHtml());
40 | assertEquals(httpSourceTest.getUrlAccepted(), fromRow.getUrlAccepted());
41 | assertEquals(httpSourceTest.getTitle(), fromRow.getTitle());
42 | assertEquals(httpSourceTest.getText(), fromRow.getText());
43 | assertEquals(httpSourceTest.getDate(), fromRow.getDate());
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourcesCSVUtilsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.adminui.utils;
2 |
3 | import lt.tokenmill.crawling.data.HttpSource;
4 | import org.junit.Test;
5 |
6 | import java.util.Arrays;
7 | import java.util.Map;
8 |
9 | import static lt.tokenmill.crawling.adminui.utils.HttpSourceCSVUtils.CSV_COLUMNS;
10 | import static org.junit.Assert.assertEquals;
11 |
12 | public class HttpSourcesCSVUtilsTest {
13 |
14 | @Test
15 | public void testHttpSourcesToCsvAndBack() {
16 | HttpSource source = new HttpSource();
17 | source.setUrl("url");
18 | source.setName("name");
19 | source.setLanguage("language");
20 | source.setTimezone("timezone");
21 | source.setEnabled(true);
22 | source.setDiscoveryEnabled(true);
23 | source.setUrlRecrawlDelayInSecs(1);
24 | source.setFeedRecrawlDelayInSecs(1);
25 | source.setSitemapRecrawlDelayInSecs(1);
26 | source.setUrls(Arrays.asList("url1", "url2"));
27 | source.setFeeds(Arrays.asList("feed1", "feed2"));
28 | source.setSitemaps(Arrays.asList("sitemap1", "sitemap2"));
29 | source.setCategories(Arrays.asList("cat1", "cat2"));
30 | source.setAppIds(Arrays.asList("app1", "app2"));
31 | source.setUrlFilters(Arrays.asList("f1", "f2"));
32 | source.setUrlNormalizers(Arrays.asList("n1", "n2"));
33 | source.setTitleSelectors(Arrays.asList("ts1", "ts2"));
34 | source.setTextSelectors(Arrays.asList("ts1", "ts2"));
35 | source.setTextNormalizers(Arrays.asList("tn1", "tn2"));
36 | source.setDateSelectors(Arrays.asList("ds1", "ds2"));
37 | source.setDateRegexps(Arrays.asList("dr1", "dr2"));
38 | source.setDateFormats(Arrays.asList("df1", "df2"));
39 |
40 | String[] row = HttpSourceCSVUtils.mapHttpSourceToCsvRow(source);
41 | String[] headerLine = CSV_COLUMNS;
42 |         Map<String, Integer> columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS);
43 | HttpSource fromRow = HttpSourceCSVUtils.mapCsvRowToHttpSource(row, columnIndexes);
44 | assertEquals(source, fromRow);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/analysis-ui/conf/development.properties:
--------------------------------------------------------------------------------
1 | port=8080
2 | es.hostname=localhost
3 | es.transport.port=9300
4 | es.httpsource.index.name=http_sources
5 | es.httpsource.doc.type=http_source
6 | es.httpsourcetest.index.name=http_source_tests
7 | es.httpsourcetest.doc.type=http_source_test
8 | es.namedqueries.index.name=named_queries
9 | es.namedqueries.doc.type=named_query
10 | es.docs.index.name=docs
11 | es.docs.doc.type=doc
--------------------------------------------------------------------------------
/analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/AnalysisUI.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.analysisui;
2 |
3 | import com.vaadin.annotations.Theme;
4 | import com.vaadin.annotations.VaadinServletConfiguration;
5 | import com.vaadin.server.VaadinRequest;
6 | import com.vaadin.server.VaadinServlet;
7 | import com.vaadin.ui.UI;
8 | import lt.tokenmill.crawling.analysisui.view.SearchView;
9 |
10 | import javax.servlet.annotation.WebServlet;
11 |
12 | @Theme("analysistheme")
13 | public class AnalysisUI extends UI {
14 |
15 | @Override
16 | protected void init(VaadinRequest vaadinRequest) {
17 | setContent(new SearchView());
18 | }
19 |
20 | @WebServlet(urlPatterns = "/*", name = "AnalysisUIServlet", asyncSupported = true)
21 | @VaadinServletConfiguration(ui = AnalysisUI.class, productionMode = false)
22 | public static class AnalysisUIServlet extends VaadinServlet {
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/Application.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.analysisui;
2 |
3 | import com.vaadin.server.VaadinServlet;
4 | import lt.tokenmill.crawling.commonui.Configuration;
5 | import org.eclipse.jetty.server.Server;
6 | import org.eclipse.jetty.servlet.ServletContextHandler;
7 | import org.eclipse.jetty.servlet.ServletHolder;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 |
12 | public class Application {
13 |
14 | private static final Logger LOG = LoggerFactory.getLogger(Application.class);
15 | private static final Boolean PRODUCTION_MODE = true;
16 |
17 | public static void main(String[] args) {
18 | int port = Configuration.INSTANCE.getInt("port", 8080);
19 | Server server = new Server(port);
20 | ServletContextHandler contextHandler
21 | = new ServletContextHandler(ServletContextHandler.SESSIONS);
22 | contextHandler.setContextPath("/");
23 | ServletHolder sh = new ServletHolder(new VaadinServlet());
24 | contextHandler.addServlet(sh, "/*");
25 | contextHandler.setInitParameter("ui", AnalysisUI.class.getCanonicalName());
26 | contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE));
27 | server.setHandler(contextHandler);
28 | try {
29 | server.start();
30 | server.join();
31 | } catch (Exception e) {
32 | LOG.error("Failed to start application", e);
33 | }
34 | }
35 | }
--------------------------------------------------------------------------------
/analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/search/ResultPanel.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.analysisui.search;
2 |
3 | import com.vaadin.shared.ui.label.ContentMode;
4 | import com.vaadin.ui.Label;
5 | import com.vaadin.ui.Panel;
6 | import com.vaadin.ui.VerticalLayout;
7 | import lt.tokenmill.crawling.data.DataUtils;
8 | import lt.tokenmill.crawling.data.HighlightedSearchResult;
9 | import lt.tokenmill.crawling.data.HttpArticle;
10 |
11 | import java.util.stream.Collectors;
12 |
13 | public class ResultPanel extends Panel {
14 |
15 |     private static final String RESULTS_TEMPLATE = "%s <a href=\"%s\">%s</a> • %s<br>%s";
16 |
17 | public ResultPanel(HighlightedSearchResult searchResult) {
18 | HttpArticle article = searchResult.getArticle();
19 |         String highlights = searchResult.getHighlights().stream().collect(Collectors.joining("<br>...<br>"));
20 | String text = String.format(RESULTS_TEMPLATE,
21 | DataUtils.formatInUTC(article.getPublished()).replace("T", " "),
22 | article.getUrl(), article.getTitle(), article.getSource(), highlights);
23 | Label content = new Label(text);
24 | content.setContentMode(ContentMode.HTML);
25 | VerticalLayout component = new VerticalLayout(content);
26 | component.setMargin(true);
27 | setContent(component);
28 | }
29 |
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/BaseView.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.analysisui.view;
2 |
3 |
4 | import com.vaadin.ui.HorizontalLayout;
5 | import com.vaadin.ui.MenuBar;
6 | import com.vaadin.ui.UI;
7 | import com.vaadin.ui.VerticalLayout;
8 |
9 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
10 |
11 | public class BaseView extends VerticalLayout {
12 |
13 | public BaseView(String title) {
14 | UI.getCurrent().getPage().setTitle(String.format("Analysis | %s", title));
15 | setWidth(100, PERCENTAGE);
16 | setSpacing(true);
17 | setMargin(true);
18 |
19 | HorizontalLayout actionBarLayout = new HorizontalLayout();
20 | actionBarLayout.setWidth(100, PERCENTAGE);
21 |
22 | MenuBar menu = new MenuBar();
23 |
24 | menu.addItem("Search", (item) -> UI.getCurrent().setContent(new SearchView()));
25 | menu.addItem("Context Cloud", (item) -> UI.getCurrent().setContent(new ContextCloudView()));
26 |
27 | actionBarLayout.addComponent(menu);
28 |
29 | addComponent(actionBarLayout);
30 | }
31 |
32 | }
--------------------------------------------------------------------------------
/analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/SearchView.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.analysisui.view;
2 |
3 | import com.google.common.collect.Lists;
4 | import com.vaadin.ui.*;
5 | import lt.tokenmill.crawling.analysisui.search.ResultPanel;
6 | import lt.tokenmill.crawling.commonui.ElasticSearch;
7 | import lt.tokenmill.crawling.data.HighlightedSearchResult;
8 | import lt.tokenmill.crawling.data.NamedQuery;
9 | import lt.tokenmill.crawling.data.PageableList;
10 | import lt.tokenmill.crawling.parser.utils.QueryParser;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | import java.util.List;
15 | import java.util.stream.Collectors;
16 |
17 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
18 |
19 | public class SearchView extends BaseView {
20 |
21 | private static final Logger LOG = LoggerFactory.getLogger(SearchView.class);
22 |
23 | private TextField filterField = new TextField();
24 | private Label queryDescriptionLabel = new Label();
25 | private VerticalLayout resultLayout = new VerticalLayout();
26 |
27 | public SearchView() {
28 | super("Search");
29 | Button searchButton = new Button("Search");
30 | searchButton.addClickListener(event -> search());
31 |
32 | VerticalLayout searchLayout = new VerticalLayout();
33 | searchLayout.setSpacing(true);
34 | searchLayout.setWidth(50, PERCENTAGE);
35 |
36 | HorizontalLayout actionHeader = new HorizontalLayout(filterField, searchButton);
37 | actionHeader.setSpacing(true);
38 | actionHeader.setWidth(100, PERCENTAGE);
39 | actionHeader.setExpandRatio(filterField, 1.0f);
40 | filterField.setWidth(100, PERCENTAGE);
41 |
42 | searchLayout.addComponent(actionHeader);
43 | searchLayout.addComponent(queryDescriptionLabel);
44 |
45 | addComponent(searchLayout);
46 | setComponentAlignment(searchLayout, Alignment.TOP_CENTER);
47 |
48 | resultLayout.setWidth(80, PERCENTAGE);
49 | resultLayout.setSpacing(true);
50 |
51 | addComponent(resultLayout);
52 | setComponentAlignment(resultLayout, Alignment.TOP_CENTER);
53 |
54 | }
55 |
56 | private void search() {
57 | resultLayout.removeAllComponents();
58 |         List<String> query = QueryParser.parseQuery(filterField.getValue());
59 | LOG.info("Parsed '{}' from query '{}'", query, filterField.getValue());
60 |         List<NamedQuery> includedNamed = Lists.newArrayList();
61 |         List<NamedQuery> excludedNamed = Lists.newArrayList();
62 | StringBuilder additionalQuery = new StringBuilder();
63 | for (String q : query) {
64 | boolean excluded = q.startsWith("-");
65 | String name = q.replaceAll("^[+-]+", "");
66 | NamedQuery namedQuery = ElasticSearch.getNamedQueryOperations().get(name);
67 | if (namedQuery != null && excluded) {
68 | excludedNamed.add(namedQuery);
69 | LOG.info("Named query '{}' is negative", namedQuery.getName());
70 | } else if (namedQuery != null) {
71 | includedNamed.add(namedQuery);
72 | LOG.info("Named query '{}' is positive", namedQuery.getName());
73 | } else {
74 | additionalQuery.append(" ").append(q);
75 | }
76 | }
77 | LOG.info("Additional query: '{}'", additionalQuery.toString().trim());
78 |         PageableList<HighlightedSearchResult> result = ElasticSearch.getDocumentOperations().query(includedNamed, excludedNamed, additionalQuery.toString().trim());
79 |         List<NamedQuery> namedQueries = Lists.newArrayList(includedNamed);
80 | namedQueries.addAll(excludedNamed);
81 |
82 | queryDescriptionLabel.setValue(String.format("Named Queries: %s, Additional Query: '%s'",
83 | namedQueries.stream().map(NamedQuery::getName).collect(Collectors.joining("', '", "'", "'")),
84 | additionalQuery.toString().trim()));
85 |
86 | for (HighlightedSearchResult r : result.getItems()) {
87 | resultLayout.addComponent(new ResultPanel(r));
88 | }
89 |
90 |
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/analysis-ui/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=DEBUG, stdout
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Target=System.out
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n
7 |
8 |
9 | log4j.logger.org.apache=INFO
10 | log4j.logger.org.eclipse.jetty=INFO
11 | log4j.logger.org.elasticsearch=INFO
--------------------------------------------------------------------------------
/analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/addons.scss:
--------------------------------------------------------------------------------
1 | /* This file is automatically managed and will be overwritten from time to time. */
2 | /* Do not manually edit this file. */
3 |
4 | /* Import and include this mixin into your project theme to include the addon themes */
5 | @mixin addons {
6 | }
7 |
8 |
--------------------------------------------------------------------------------
/analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/analysistheme.scss:
--------------------------------------------------------------------------------
1 | // If you edit this file you need to compile the theme. See README.md for details.
2 | // Global variable overrides. Must be declared before importing Valo.
3 | // Defines the plaintext font size, weight and family. Font size affects general component sizing.
4 |
5 | //$v-font-size: 16px;
6 | //$v-font-weight: 300;
7 | //$v-font-family: "Open Sans", sans-serif;
8 |
9 | // Defines the border used by all components.
10 | //$v-border: 1px solid (v-shade 0.7);
11 | //$v-border-radius: 4px;
12 |
13 | // Affects the color of some component elements, e.g Button, Panel title, etc
14 | //$v-background-color: hsl(210, 0%, 98%);
15 |
16 | // Affects the color of content areas, e.g Panel and Window content, TextField input etc
17 | //$v-app-background-color: $v-background-color;
18 |
19 | // Affects the visual appearance of all components
20 | //$v-gradient: v-linear 8%;
21 | //$v-bevel-depth: 30%;
22 | //$v-shadow-opacity: 5%;
23 |
24 | // Defines colors for indicating status (focus, success, failure)
25 | //$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically
26 | //$v-friendly-color: #2c9720;
27 | //$v-error-indicator-color: #ed473b;
28 |
29 | // For more information, see: https://vaadin.com/book/-/page/themes.valo.html
30 | // Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples
31 |
32 | @import "../valo/valo.scss";
33 |
34 | @mixin analysistheme {
35 | @include valo;
36 |
37 | // Insert your own theme rules here
38 | }
--------------------------------------------------------------------------------
/analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/styles.scss:
--------------------------------------------------------------------------------
1 | @import "analysistheme.scss";
2 | @import "addons.scss";
3 |
4 | // This file prefixes all rules with the theme name to avoid causing conflicts with other themes.
5 | // The actual styles should be defined in analysistheme.scss
6 |
7 | .analysistheme {
8 | @include addons;
9 | @include analysistheme;
10 | }
--------------------------------------------------------------------------------
/bin/create-es-index.sh:
--------------------------------------------------------------------------------
1 | # $1 - index name (docs, http_sources)
2 | # $2 - ES index config file name
3 | # $3 - ES host
4 | # $4 - application name
5 |
6 | if [ -z "$4" ]
7 | then
8 | export INDEX_URL="http://$3:9200/$1_v1"
9 | else
10 | export INDEX_URL="http://$3:9200/$4-$1_v1"
11 | fi
12 |
13 |
14 | curl -H "Content-Type:application/json" -XDELETE "$INDEX_URL"
15 | echo
16 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL" -d @elasticsearch/src/main/resources/indices/$2
17 | echo
18 | if [ -z "$4" ]
19 | then
20 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$1"
21 | echo
22 | else
23 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$4-$1"
24 | echo
25 | fi
26 |
--------------------------------------------------------------------------------
/bin/create-es-indices.sh:
--------------------------------------------------------------------------------
1 | bin/create-es-index.sh docs document.json ${1:-localhost} $2
2 | bin/create-es-index.sh named_queries query.json ${1:-localhost} $2
3 | bin/create-es-index.sh http_sources http_source.json ${1:-localhost} $2
4 | bin/create-es-index.sh http_source_tests http_source_test.json ${1:-localhost} $2
5 | bin/create-es-index.sh urls url.json ${1:-localhost} $2
6 |
--------------------------------------------------------------------------------
/bin/deploy-crawler.sh:
--------------------------------------------------------------------------------
1 | STORM_HOME=/opt/storm/apache-storm-1.1.1
2 | mvn clean install -Pbigjar -Dstorm.scope=provided
3 | $STORM_HOME/bin/storm jar crawler/target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -conf crawler/conf/local.yaml
4 |
--------------------------------------------------------------------------------
/bin/run-administration-ui.sh:
--------------------------------------------------------------------------------
1 | ( cd administration-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/administration-ui-standalone.jar )
2 |
--------------------------------------------------------------------------------
/bin/run-analysis-ui.sh:
--------------------------------------------------------------------------------
1 | ( cd analysis-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/analysis-ui-standalone.jar )
--------------------------------------------------------------------------------
/bin/run-crawler.sh:
--------------------------------------------------------------------------------
1 | ( cd crawler && mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests && java -cp target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -local -conf conf/local.yaml )
2 |
--------------------------------------------------------------------------------
/crawler/conf/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | fetcher.server.delay: 4.5
2 | fetcher.server.min.delay: 3.0
3 | fetcher.queue.mode: "byHost"
4 | fetcher.threads.per.queue: 1
5 | fetcher.threads.number: 5
6 |
7 | partition.url.mode: "byHost"
8 |
9 | metadata.track.path: false
10 | metadata.track.depth: false
11 | metadata.transfer:
12 | - "source"
13 |
14 | http.agent.name: "NewsRadar"
15 | http.agent.version: "1.0"
16 | http.agent.description: "News Crawler"
17 | http.agent.url: ""
18 | http.agent.email: ""
19 |
20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
22 | http.content.limit: 1048576
23 | http.store.responsetime: false
24 | http.timeout: 30000
25 |
26 | http.robots.403.allow: true
27 |
28 | protocols: "http,https"
29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
31 |
32 | urlfilters.config.file: "urlfilters.json"
33 |
34 | # revisit a page monthly (value in minutes)
35 | fetchInterval.default: 44640
36 |
37 | # revisit a page with a fetch error after 2 hours (value in minutes)
38 | fetchInterval.fetch.error: 120
39 |
40 | # revisit a page with an error every month (value in minutes)
41 | fetchInterval.error: 44640
42 |
43 | # Default implementation of Scheduler
44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"
45 |
46 | topology.workers: 1
47 | topology.sleep.spout.wait.strategy.time.ms: 5000
48 | topology.message.timeout.secs: 300
49 | topology.max.spout.pending: 100
50 | topology.debug: false
51 |
52 | # ElasticSearch configuration
53 | es.hostname: "elasticsearch"
54 | es.rest.port: 9200
55 |
56 | es.urls.index.name: "urls"
57 | es.urls.doc.type: "url"
58 | es.docs.index.name: "docs"
59 | es.docs.doc.type: "doc"
60 | es.httpsource.index.name: "http_sources"
61 | es.httpsource.doc.type: "http_source"
62 |
63 | # MetricsConsumer configuration
64 | es.metrics.addresses: "elasticsearch:9300"
65 | es.metrics.index.name: "metrics"
66 | es.metrics.doc.type: "datapoint"
67 | es.metrics.cluster.name: "elasticsearch"
68 | es.metrics.blacklist:
69 | - "__"
70 | - "uptime"
71 | - "memory"
72 | - "GC"
73 | - "newWorkerEvent"
74 | - "startTimeSecs"
75 |
--------------------------------------------------------------------------------
/crawler/conf/local.yaml:
--------------------------------------------------------------------------------
1 | fetcher.server.delay: 4.5
2 | fetcher.server.min.delay: 3.0
3 | fetcher.queue.mode: "byHost"
4 | fetcher.threads.per.queue: 1
5 | fetcher.threads.number: 5
6 |
7 | partition.url.mode: "byHost"
8 |
9 | metadata.track.path: false
10 | metadata.track.depth: false
11 | metadata.transfer:
12 | - "source"
13 |
14 | http.agent.name: "NewsRadar"
15 | http.agent.version: "1.0"
16 | http.agent.description: "News Crawler"
17 | http.agent.url: ""
18 | http.agent.email: ""
19 |
20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
22 | http.content.limit: 1048576
23 | http.store.responsetime: false
24 | http.timeout: 30000
25 |
26 | http.robots.403.allow: true
27 |
28 | protocols: "http,https"
29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
31 |
32 | urlfilters.config.file: "urlfilters.json"
33 |
34 | # revisit a page monthly (value in minutes)
35 | fetchInterval.default: 44640
36 |
37 | # revisit a page with a fetch error after 2 hours (value in minutes)
38 | fetchInterval.fetch.error: 120
39 |
40 | # revisit a page with an error every month (value in minutes)
41 | fetchInterval.error: 44640
42 |
43 | # Default implementation of Scheduler
44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"
45 |
46 | topology.workers: 1
47 | topology.sleep.spout.wait.strategy.time.ms: 5000
48 | topology.message.timeout.secs: 300
49 | topology.max.spout.pending: 100
50 | topology.debug: false
51 |
52 | # ElasticSearch configuration
53 | es.hostname: "localhost"
54 | es.rest.port: 9200
55 |
56 | es.urls.index.name: "urls"
57 | es.urls.doc.type: "url"
58 | es.docs.index.name: "docs"
59 | es.docs.doc.type: "doc"
60 | es.httpsource.index.name: "http_sources"
61 | es.httpsource.doc.type: "http_source"
62 |
63 | # MetricsConsumer configuration
64 | es.metrics.addresses: "localhost:9300"
65 | es.metrics.index.name: "metrics"
66 | es.metrics.doc.type: "datapoint"
67 | es.metrics.cluster.name: "elasticsearch"
68 | es.metrics.blacklist:
69 | - "__"
70 | - "uptime"
71 | - "memory"
72 | - "GC"
73 | - "newWorkerEvent"
74 | - "startTimeSecs"
75 |
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerConstants.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler;
2 |
3 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
4 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import java.util.concurrent.TimeUnit;
9 |
10 | public class CrawlerConstants {
11 | private static final Logger LOG = LoggerFactory.getLogger(CrawlerConstants.class);
12 |
13 | private static long getReloadDelayInSeconds() {
14 | long reloadDelay = 300;
15 | String envVar = System.getenv("DEFAULT_SOURCE_RELOAD_DELAY");
16 | if (envVar != null) {
17 | try {
18 | reloadDelay = Long.parseLong(envVar);
19 | } catch (NumberFormatException e) {
20 | LOG.warn("Environment variable 'DEFAULT_SOURCE_RELOAD_DELAY' is not a number '{}'", envVar);
21 | }
22 | }
23 | return reloadDelay;
24 | }
25 |
26 | public static final long MIN_FETCH_DELAY = TimeUnit.MINUTES.toMillis(1);
27 | public static final long DEFAULT_URL_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10);
28 | public static final long DEFAULT_FEED_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10);
29 | public static final long DEFAULT_SITEMAP_FETCH_DELAY = TimeUnit.MINUTES.toMillis(30);
30 | public static final long DEFAULT_SOURCE_RELOAD_DELAY = TimeUnit.SECONDS.toMillis(getReloadDelayInSeconds());
31 |
32 | public static final String META_IS_SITEMAP = SiteMapParserBolt.isSitemapKey;
33 | public static final String META_IS_FEED = FeedParserBolt.isFeedKey;
34 | public static final String META_IS_SEED = "isSeed";
35 | public static final String META_SOURCE = "source";
36 | public static final String META_PUBLISHED = "published";
37 | public static final String META_DISCOVERED = "discovered";
38 | public static final String META_FEED_PUBLISHED = "feed.publishedDate";
39 |
40 | public static final String URL_FILTERS_FILE = "urlfilters.config.file";
41 |
42 | public static final String PARTIAL_ANALYSIS_STATUS = "PARTIAL_ANALYSIS";
43 | }
44 |
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerTopology.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler;
2 |
3 | import com.digitalpebble.stormcrawler.ConfigurableTopology;
4 | import com.digitalpebble.stormcrawler.Constants;
5 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
6 | import com.digitalpebble.stormcrawler.bolt.FetcherBolt;
7 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt;
8 | import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt;
9 | import lt.tokenmill.crawling.crawler.bolt.ArticleIndexerBolt;
10 | import lt.tokenmill.crawling.crawler.bolt.LinkExtractorBolt;
11 | import lt.tokenmill.crawling.crawler.bolt.StatusUpdaterBolt;
12 | import lt.tokenmill.crawling.crawler.spout.UrlGeneratorSpout;
13 | import org.apache.storm.Config;
14 | import org.apache.storm.topology.IRichBolt;
15 | import org.apache.storm.topology.IRichSpout;
16 | import org.apache.storm.topology.TopologyBuilder;
17 | import org.apache.storm.tuple.Fields;
18 |
19 | public class CrawlerTopology extends ConfigurableTopology {
20 |
21 | private final ServiceProvider serviceProvider;
22 |
23 | public static void main(String[] args) throws Exception {
24 | ConfigurableTopology.start(new CrawlerTopology(), args);
25 | }
26 |
27 | public CrawlerTopology() {
28 | this(new DefaultServiceProvider());
29 | }
30 |
31 | public CrawlerTopology(ServiceProvider serviceProvider) {
32 | this.serviceProvider = serviceProvider;
33 | }
34 |
35 | @Override
36 | protected int run(String[] strings) {
37 | TopologyBuilder builder = new TopologyBuilder();
38 |
39 | builder.setSpout("generator", createUrlGeneratorSpout(serviceProvider));
40 |
41 | builder.setBolt("partitioner", new URLPartitionerBolt())
42 | .shuffleGrouping("generator");
43 |
44 | builder.setBolt("fetch", new FetcherBolt())
45 | .fieldsGrouping("partitioner", new Fields("key"));
46 |
47 | builder.setBolt("sitemap", new SiteMapParserBolt())
48 | .localOrShuffleGrouping("fetch");
49 |
50 | builder.setBolt("feed", new FeedParserBolt())
51 | .localOrShuffleGrouping("sitemap");
52 |
53 | builder.setBolt("links", createLinkExtractor(serviceProvider))
54 | .localOrShuffleGrouping("feed");
55 |
56 | builder.setBolt("index", createArticleIndexer(serviceProvider))
57 | .localOrShuffleGrouping("fetch");
58 |
59 | builder.setBolt("status", createStatusUpdater(serviceProvider))
60 | .localOrShuffleGrouping("fetch", Constants.StatusStreamName)
61 | .localOrShuffleGrouping("sitemap", Constants.StatusStreamName)
62 | .localOrShuffleGrouping("index", Constants.StatusStreamName)
63 | .localOrShuffleGrouping("links", Constants.StatusStreamName);
64 |
65 | String topologyName = (String) conf.getOrDefault(Config.TOPOLOGY_NAME, "crawler");
66 | System.setProperty("es.set.netty.runtime.available.processors", "false");
67 | return submit(topologyName, conf, builder);
68 | }
69 |
70 | protected IRichSpout createUrlGeneratorSpout(ServiceProvider serviceProvider) {
71 | return new UrlGeneratorSpout(serviceProvider);
72 | }
73 |
74 | protected IRichBolt createLinkExtractor(ServiceProvider serviceProvider) {
75 | return new LinkExtractorBolt(serviceProvider);
76 | }
77 |
78 | protected IRichBolt createArticleIndexer(ServiceProvider serviceProvider) {
79 | return new ArticleIndexerBolt(serviceProvider);
80 | }
81 |
82 | protected IRichBolt createStatusUpdater(ServiceProvider serviceProvider) {
83 | return new StatusUpdaterBolt(serviceProvider);
84 | }
85 |
86 | }
87 |
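
Sketch (not a repository file): the protected create* factory methods above form the topology's extension seam. A minimal subclass can swap in its own bolt while keeping the rest of the wiring; MyLinkExtractorBolt below is a hypothetical replacement, not a class in this codebase.

    import org.apache.storm.topology.IRichBolt;

    public class CustomCrawlerTopology extends CrawlerTopology {

        public CustomCrawlerTopology(ServiceProvider serviceProvider) {
            super(serviceProvider);
        }

        @Override
        protected IRichBolt createLinkExtractor(ServiceProvider serviceProvider) {
            // Reuse the standard spout/fetcher/status wiring; replace only link extraction.
            return new MyLinkExtractorBolt(serviceProvider);
        }
    }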
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler;
2 |
3 | import com.digitalpebble.stormcrawler.util.ConfUtils;
4 | import com.google.common.collect.Maps;
5 | import lt.tokenmill.crawling.es.*;
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import java.io.Serializable;
10 | import java.util.Map;
11 |
12 | public class DefaultServiceProvider implements ServiceProvider, Serializable {
13 |
14 | private static final Logger LOG = LoggerFactory.getLogger(DefaultServiceProvider.class);
15 |
16 |     private static final Map<String, ElasticConnection> ES_CONNECTIONS = Maps.newConcurrentMap();
17 |
18 | public static ElasticConnection getElasticConnection(Map conf) {
19 | String hostname = ConfUtils.getString(conf, ElasticConstants.ES_HOSTNAME_PARAM);
20 | int restPort = ConfUtils.getInt(conf, ElasticConstants.ES_REST_PORT, 9200);
21 | String restScheme = ConfUtils.getString(conf, ElasticConstants.ES_REST_SCHEME, "http");
22 | if (ES_CONNECTIONS.containsKey(hostname)) {
23 | return ES_CONNECTIONS.get(hostname);
24 | } else {
25 | ElasticConnection elasticConnection = ElasticConnection.getConnection(hostname, restPort, restScheme);
26 | ES_CONNECTIONS.put(hostname, elasticConnection);
27 | return ES_CONNECTIONS.get(hostname);
28 | }
29 | }
30 |
31 | public EsHttpUrlOperations createEsHttpUrlOperations(Map conf) {
32 | ElasticConnection connection = getElasticConnection(conf);
33 | String urlsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_URLS_INDEX_NAME_PARAM);
34 | String urlsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_URLS_DOC_TYPE_PARAM);
35 | return EsHttpUrlOperations.getInstance(connection, urlsIndexName, urlsDocumentType);
36 | }
37 |
38 | public EsHttpSourceOperations createEsHttpSourceOperations(Map conf) {
39 | ElasticConnection connection = getElasticConnection(conf);
40 | String sourcesIndexName = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM);
41 | String sourcesDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM);
42 | return EsHttpSourceOperations.getInstance(connection, sourcesIndexName, sourcesDocumentType);
43 | }
44 |
45 | public EsDocumentOperations creatEsDocumentOperations(Map conf) {
46 | ElasticConnection connection = getElasticConnection(conf);
47 | String docsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_INDEX_NAME_PARAM);
48 | String docsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_DOC_TYPE_PARAM);
49 | return EsDocumentOperations.getInstance(connection, docsIndexName, docsDocumentType);
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler;
2 |
3 | import lt.tokenmill.crawling.es.EsDocumentOperations;
4 | import lt.tokenmill.crawling.es.EsHttpSourceOperations;
5 | import lt.tokenmill.crawling.es.EsHttpUrlOperations;
6 |
7 | import java.util.Map;
8 |
9 | /***
10 | * Interface for external service factory.
11 | */
12 | public interface ServiceProvider {
13 |
14 | EsHttpUrlOperations createEsHttpUrlOperations(Map conf);
15 |
16 | EsHttpSourceOperations createEsHttpSourceOperations(Map conf);
17 |
18 | EsDocumentOperations creatEsDocumentOperations(Map conf);
19 | }
20 |
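
Sketch (not a repository file): a minimal alternative provider that hands back pre-built operation objects, e.g. to keep topology tests away from a live Elasticsearch. Constructor injection here is an assumption, not the project's convention.

    import java.io.Serializable;
    import java.util.Map;

    import lt.tokenmill.crawling.es.EsDocumentOperations;
    import lt.tokenmill.crawling.es.EsHttpSourceOperations;
    import lt.tokenmill.crawling.es.EsHttpUrlOperations;

    public class FixedServiceProvider implements ServiceProvider, Serializable {

        private final EsHttpUrlOperations urlOperations;
        private final EsHttpSourceOperations sourceOperations;
        private final EsDocumentOperations documentOperations;

        public FixedServiceProvider(EsHttpUrlOperations urlOperations,
                                    EsHttpSourceOperations sourceOperations,
                                    EsDocumentOperations documentOperations) {
            this.urlOperations = urlOperations;
            this.sourceOperations = sourceOperations;
            this.documentOperations = documentOperations;
        }

        @Override
        public EsHttpUrlOperations createEsHttpUrlOperations(Map conf) {
            return urlOperations;
        }

        @Override
        public EsHttpSourceOperations createEsHttpSourceOperations(Map conf) {
            return sourceOperations;
        }

        @Override
        public EsDocumentOperations creatEsDocumentOperations(Map conf) {
            return documentOperations;
        }
    }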
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.bolt;
2 |
3 | import com.digitalpebble.stormcrawler.Metadata;
4 | import com.digitalpebble.stormcrawler.persistence.AbstractStatusUpdaterBolt;
5 | import com.digitalpebble.stormcrawler.persistence.Status;
6 | import lt.tokenmill.crawling.crawler.CrawlerConstants;
7 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider;
8 | import lt.tokenmill.crawling.crawler.ServiceProvider;
9 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache;
10 | import lt.tokenmill.crawling.data.HttpSource;
11 | import lt.tokenmill.crawling.es.*;
12 | import lt.tokenmill.crawling.parser.urls.UrlFilters;
13 | import org.apache.storm.metric.api.MultiCountMetric;
14 | import org.apache.storm.task.OutputCollector;
15 | import org.apache.storm.task.TopologyContext;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | import java.util.Date;
20 | import java.util.Map;
21 |
22 | public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt {
23 |
24 | private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class);
25 |
26 | private MultiCountMetric eventCounter;
27 |
28 | private EsHttpUrlOperations esUrlsOperations;
29 | private EsHttpSourceOperations esHttpSourcesOperations;
30 | private ServiceProvider serviceProvider;
31 |
32 | public StatusUpdaterBolt(ServiceProvider serviceProvider) {
33 | this.serviceProvider = serviceProvider;
34 | }
35 |
36 | @Override
37 | public void store(String url, Status status, Metadata metadata, Date nextFetch) throws Exception {
38 | try {
39 | String source = metadata.getFirstValue(CrawlerConstants.META_SOURCE);
40 | Boolean isSeed = Boolean.parseBoolean(metadata.getFirstValue(CrawlerConstants.META_IS_SEED));
41 | HttpSource httpSource = EsHttpSourcesCache.get(esHttpSourcesOperations, source);
42 | UrlFilters filters = UrlFiltersCache.get(httpSource);
43 |
44 | String filtered = filters.filter(url);
45 | if (isSeed || (filtered == null && status.equals(Status.DISCOVERED))) {
46 | LOG.debug("Url '{}' is seed or rejected by filters", url);
47 | return;
48 | }
49 |
50 | String id = (filtered == null) ? url : filtered;
51 |
52 | LOG.debug("Setting '{}' status to '{}'", id, status);
53 |
54 |
55 | boolean create = status.equals(Status.DISCOVERED);
56 | String published = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED);
57 | if (published == null) {
58 | published = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED);
59 | }
60 | esUrlsOperations.upsertUrlStatus(id, published, source, create, status);
61 |
62 | if (status == Status.DISCOVERED) {
63 | eventCounter.scope("urls_discovered").incr();
64 | }
65 | } catch (Exception e) {
66 | LOG.error("Failed to set status for url '{}'", url, e);
67 | }
68 | }
69 |
70 |
71 | @Override
72 | public void prepare(Map conf, TopologyContext context, OutputCollector outputCollector) {
73 | super.prepare(conf, context, outputCollector);
74 | this.eventCounter = context.registerMetric(this.getClass().getSimpleName(), new MultiCountMetric(), 10);
75 | this.esUrlsOperations = this.serviceProvider.createEsHttpUrlOperations(conf);
76 | this.esHttpSourcesOperations = this.serviceProvider.createEsHttpSourceOperations(conf);
77 | }
78 |
79 | @Override
80 | public void cleanup() {
81 | super.cleanup();
82 | }
83 | }
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/HttpSourceConfiguration.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.spout;
2 |
3 | import com.google.common.collect.Iterables;
4 | import lt.tokenmill.crawling.crawler.CrawlerConstants;
5 | import lt.tokenmill.crawling.crawler.utils.PrioritizedSource;
6 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache;
7 | import lt.tokenmill.crawling.data.HttpSource;
8 | import lt.tokenmill.crawling.es.EsHttpSourcesCache;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.util.Iterator;
13 | import java.util.List;
14 | import java.util.PriorityQueue;
15 | import java.util.stream.Collectors;
16 |
17 | import static java.lang.System.currentTimeMillis;
18 |
19 | public class HttpSourceConfiguration {
20 |
21 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceConfiguration.class);
22 |
23 |     private final List<HttpSource> sources;
24 |     private final List<String> sourceUrls;
25 |     private final Iterator<String> sourceCycle;
26 |     private final PriorityQueue<PrioritizedSource> prioritizedSources;
27 |
28 | private static long lastReloadMillis = 0;
29 |
30 |     private HttpSourceConfiguration(List<HttpSource> sources) {
31 | this.sources = sources;
32 | this.sourceUrls = sources.stream()
33 | .map(HttpSource::getUrl)
34 | .collect(Collectors.toList());
35 | LOG.info("Loaded {} active HTTP sources", this.sourceUrls.size());
36 | this.sourceCycle = Iterables.cycle(this.sourceUrls).iterator();
37 | this.prioritizedSources =
38 | new PriorityQueue<>(new PrioritizedSource.PrioritizedUrlComparator());
39 | sources.forEach(s -> {
40 | s.getUrls().forEach(u -> prioritizedSources.offer(PrioritizedSource.createUrl(u, s)));
41 | s.getFeeds().forEach(u -> prioritizedSources.offer(PrioritizedSource.createFeed(u, s)));
42 | s.getSitemaps().forEach(u -> prioritizedSources.offer(PrioritizedSource.createSitemap(u, s)));
43 | });
44 | }
45 |
46 | public PrioritizedSource prioritized() {
47 | PrioritizedSource prioritized = prioritizedSources.peek();
48 | if (prioritized != null &&
49 | (prioritized.getNextFetchTime() <= currentTimeMillis())) {
50 | prioritized = prioritizedSources.poll();
51 | prioritized.recalculateNextFetchTime();
52 | prioritizedSources.offer(prioritized);
53 | return prioritized;
54 | }
55 | return null;
56 | }
57 |
58 | public int maxTries() {
59 | return Math.min(10, sourceUrls.size());
60 | }
61 |
62 | public boolean hasNextActive() {
63 | return sourceCycle.hasNext();
64 | }
65 |
66 |
67 | public String nextActive() {
68 | return sourceCycle.next();
69 | }
70 |
71 |     public static HttpSourceConfiguration reload(HttpSourceConfiguration current, List<HttpSource> sources) {
72 | HttpSourceConfiguration configuration;
73 | if (current != null && current.sources.equals(sources)) {
74 | LOG.info("HTTP source configuration didn't change. Using current version");
75 | configuration = current;
76 | } else {
77 | configuration = new HttpSourceConfiguration(sources);
78 | EsHttpSourcesCache.invalidate();
79 | UrlFiltersCache.invalidate();
80 | }
81 | lastReloadMillis = currentTimeMillis();
82 | return configuration;
83 | }
84 |
85 | public static boolean needsReload() {
86 | LOG.info("Checking reloading timeout. Remaining milliseconds: {}",
87 | lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY - currentTimeMillis());
88 | return lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY < currentTimeMillis();
89 | }
90 |
91 | }
92 |
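
Sketch (not a repository file): the reload cycle this class supports, as the spout presumably drives it. fetchActiveSources() is a hypothetical stand-in for the spout's actual Elasticsearch lookup of enabled sources.

    import java.util.Collections;
    import java.util.List;

    import lt.tokenmill.crawling.crawler.spout.HttpSourceConfiguration;
    import lt.tokenmill.crawling.crawler.utils.PrioritizedSource;
    import lt.tokenmill.crawling.data.HttpSource;

    public class ReloadLoopSketch {

        // Hypothetical stand-in for the spout's Elasticsearch query of enabled HTTP sources.
        static List<HttpSource> fetchActiveSources() {
            return Collections.emptyList();
        }

        public static void main(String[] args) {
            HttpSourceConfiguration configuration = null;
            for (int i = 0; i < 10; i++) {
                if (configuration == null || HttpSourceConfiguration.needsReload()) {
                    // reload() reuses the current instance when the source list is unchanged;
                    // otherwise it rebuilds the cycle/queue and invalidates the caches.
                    configuration = HttpSourceConfiguration.reload(configuration, fetchActiveSources());
                }
                PrioritizedSource next = configuration.prioritized();
                if (next != null) {
                    System.out.println("Due for fetch: " + next.getUrl());
                }
            }
        }
    }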
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/PrioritizedSource.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.utils;
2 |
3 |
4 | import lt.tokenmill.crawling.data.HttpSource;
5 |
6 | import java.io.Serializable;
7 | import java.util.Comparator;
8 | import java.util.concurrent.TimeUnit;
9 |
10 | import static lt.tokenmill.crawling.crawler.CrawlerConstants.*;
11 |
12 | public class PrioritizedSource implements Serializable {
13 |
14 |
15 |
16 | private final String url;
17 |
18 | private final HttpSource source;
19 |
20 | private Long delay = MIN_FETCH_DELAY;
21 |
22 | private boolean sitemap = false;
23 | private boolean feed = false;
24 |
25 | private Long nextFetchTime = System.currentTimeMillis();
26 |
27 | private PrioritizedSource(String url, HttpSource source) {
28 | this.url = url;
29 | this.source = source;
30 | }
31 |
32 | private void setDelay(Long delay) {
33 | this.delay = Math.max(delay, MIN_FETCH_DELAY);
34 | }
35 |
36 | private void setSitemap(boolean sitemap) {
37 | this.sitemap = sitemap;
38 | }
39 |
40 | private void setFeed(boolean feed) {
41 | this.feed = feed;
42 | }
43 |
44 | public void recalculateNextFetchTime() {
45 | nextFetchTime = System.currentTimeMillis() + delay;
46 | }
47 |
48 | public String getUrl() {
49 | return url;
50 | }
51 |
52 | public boolean isSitemap() {
53 | return sitemap;
54 | }
55 |
56 | public boolean isFeed() {
57 | return feed;
58 | }
59 |
60 | public HttpSource getSource() {
61 | return source;
62 | }
63 |
64 | public long getNextFetchTime() {
65 | return nextFetchTime;
66 | }
67 |
68 |     public static class PrioritizedUrlComparator implements Comparator<PrioritizedSource>, Serializable {
69 |
70 | @Override
71 | public int compare(PrioritizedSource u1, PrioritizedSource u2) {
72 | return u1.nextFetchTime.compareTo(u2.nextFetchTime);
73 | }
74 | }
75 |
76 | public static PrioritizedSource createUrl(String url, HttpSource source) {
77 | PrioritizedSource result = new PrioritizedSource(url, source);
78 | long delay = source.getUrlRecrawlDelayInSecs() != null ?
79 | TimeUnit.SECONDS.toMillis(source.getUrlRecrawlDelayInSecs()) : DEFAULT_URL_FETCH_DELAY;
80 | result.setDelay(delay);
81 | return result;
82 | }
83 |
84 | public static PrioritizedSource createFeed(String url, HttpSource source) {
85 | PrioritizedSource result = new PrioritizedSource(url, source);
86 | long delay = source.getFeedRecrawlDelayInSecs() != null ?
87 | TimeUnit.SECONDS.toMillis(source.getFeedRecrawlDelayInSecs()) : DEFAULT_FEED_FETCH_DELAY;
88 | result.setDelay(delay);
89 | result.setFeed(true);
90 | return result;
91 | }
92 |
93 | public static PrioritizedSource createSitemap(String url, HttpSource source) {
94 | PrioritizedSource result = new PrioritizedSource(url, source);
95 | long delay = source.getSitemapRecrawlDelayInSecs() != null ?
96 | TimeUnit.SECONDS.toMillis(source.getSitemapRecrawlDelayInSecs()) : DEFAULT_SITEMAP_FETCH_DELAY;
97 | result.setDelay(delay);
98 | result.setSitemap(true);
99 | return result;
100 | }
101 | }
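
Sketch (not a repository file): the scheduling contract in miniature. Per-type recrawl delays are floored at MIN_FETCH_DELAY (one minute), entries start due immediately, and a polled entry is pushed its delay into the future and re-offered.

    import java.util.PriorityQueue;

    import lt.tokenmill.crawling.crawler.utils.PrioritizedSource;
    import lt.tokenmill.crawling.data.HttpSource;

    public class SchedulingSketch {

        public static void main(String[] args) {
            HttpSource source = new HttpSource();
            source.setUrl("http://example.com/");
            source.setUrlRecrawlDelayInSecs(5); // below the floor, so MIN_FETCH_DELAY (1 min) applies

            PriorityQueue<PrioritizedSource> queue =
                    new PriorityQueue<>(new PrioritizedSource.PrioritizedUrlComparator());
            queue.offer(PrioritizedSource.createUrl("http://example.com/news", source));
            queue.offer(PrioritizedSource.createFeed("http://example.com/rss", source));

            // Both entries are due now; after polling, reschedule and re-offer as the spout does.
            PrioritizedSource next = queue.poll();
            next.recalculateNextFetchTime();
            queue.offer(next);
            System.out.println("Fetched and rescheduled: " + next.getUrl());
        }
    }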
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFilterUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.utils;
2 |
3 | import com.digitalpebble.stormcrawler.Metadata;
4 | import com.digitalpebble.stormcrawler.filtering.URLFilters;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import java.io.IOException;
9 | import java.net.URL;
10 | import java.util.Map;
11 |
12 | public class UrlFilterUtils {
13 |
14 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtils.class);
15 |
16 | public static URLFilters load(Map conf, String filtersConfigFile) {
17 | if (filtersConfigFile != null) {
18 | try {
19 | URLFilters loaded = new URLFilters(conf, filtersConfigFile);
20 | LOG.info("Loaded URLFilters from '{}'", filtersConfigFile);
21 | return loaded;
22 | } catch (IOException e) {
23 | LOG.error("Exception caught while loading the URLFilters");
24 | throw new RuntimeException("Exception caught while loading the URLFilters", e);
25 | }
26 | } else {
27 | return URLFilters.emptyURLFilters;
28 | }
29 | }
30 |
31 | public static String firstMatch(URL sourceUrl, Metadata metadata, String targetUrl, URLFilters...filters) {
32 | for (URLFilters filter : filters) {
33 | String filtered = filter.filter(sourceUrl, metadata, targetUrl);
34 | if (filtered != null) {
35 | return filtered;
36 | }
37 | }
38 | return null;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFiltersCache.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.utils;
2 |
3 | import com.google.common.cache.Cache;
4 | import com.google.common.cache.CacheBuilder;
5 | import lt.tokenmill.crawling.data.HttpSource;
6 | import lt.tokenmill.crawling.parser.urls.UrlFilters;
7 |
8 | import java.util.concurrent.TimeUnit;
9 |
10 | public class UrlFiltersCache {
11 |
12 |     private static final Cache<String, UrlFilters> CACHE;
13 |
14 | static {
15 | CACHE = CacheBuilder.newBuilder()
16 | .expireAfterWrite(10, TimeUnit.MINUTES)
17 | .build();
18 | }
19 |
20 | public static UrlFilters get(HttpSource source) {
21 | UrlFilters filters = CACHE.getIfPresent(source.getUrl());
22 | if (filters == null) {
23 | filters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters());
24 | CACHE.put(source.getUrl(), filters);
25 | }
26 | return filters;
27 | }
28 |
29 | public static void invalidate() {
30 | CACHE.invalidateAll();
31 | }
32 | }
--------------------------------------------------------------------------------
/crawler/src/main/resources/urlfilters.json:
--------------------------------------------------------------------------------
1 | {
2 | "com.digitalpebble.stormcrawler.filtering.URLFilters": [
3 | {
4 | "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
5 | "name": "RegexURLFilter",
6 | "params": {
7 | "regexFilterFile": "urlfilters.txt"
8 | }
9 | }
10 | ]
11 | }
--------------------------------------------------------------------------------
/crawler/src/main/resources/urlfilters.txt:
--------------------------------------------------------------------------------
1 | # Discard URLs longer than 512 chars
2 | -.{512,}
3 |
4 | # Discard URLs which are actually links to other URLs
5 | -^https?://.*https?:.*
6 |
7 | # Discard URLs containing illegal characters: space, %20, # or @
8 | -.*(?:%20| |#|\@).*
9 |
10 | #Discard media or binary files
11 | -(?i).*\.(exe|dmg|csv|mp3|mp4|m4a|avi|mov|swf|wmv|dat|mpg|mpg4|flm|mtv|video|divx|mpeg4|film|xwmv|exo|pdf|jpg|jpeg|png|bmp|gif|doc|docx|xls|xlsx|ppt|pptx|rss)$
12 |
13 | #Allow everything else
14 | +.
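
Sketch (not a repository file): a quick java.util.regex check of the media/binary rule above; the rules are applied in order, so anything not discarded falls through to the final '+.' accept-all.

    import java.util.regex.Pattern;

    public class MediaFilterSketch {

        public static void main(String[] args) {
            // Same pattern as the media/binary rule, minus the leading '-' action marker.
            Pattern media = Pattern.compile("(?i).*\\.(exe|dmg|csv|mp3|mp4|m4a|avi|mov|swf|wmv|dat|mpg|mpg4|flm|mtv|video|divx|mpeg4|film|xwmv|exo|pdf|jpg|jpeg|png|bmp|gif|doc|docx|xls|xlsx|ppt|pptx|rss)$");
            System.out.println(media.matcher("http://example.com/song.mp3").matches()); // true  -> discarded
            System.out.println(media.matcher("http://example.com/article").matches());  // false -> accepted by '+.'
        }
    }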
--------------------------------------------------------------------------------
/crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlFilterUtilsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.spout;
2 |
3 | import com.digitalpebble.stormcrawler.Metadata;
4 | import com.digitalpebble.stormcrawler.filtering.URLFilters;
5 | import com.digitalpebble.stormcrawler.util.ConfUtils;
6 | import lt.tokenmill.crawling.crawler.CrawlerConstants;
7 | import lt.tokenmill.crawling.crawler.utils.UrlFilterUtils;
8 | import org.junit.Test;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.net.MalformedURLException;
13 | import java.net.URL;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 |
17 | import static org.junit.Assert.assertNotNull;
18 | import static org.junit.Assert.assertNull;
19 |
20 | public class UrlFilterUtilsTest {
21 |
22 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtilsTest.class);
23 | private final String testSourceUrl = "http://www.tokenmill.lt/";
24 |
25 | @Test
26 | public void testUrlFilters() {
27 | Map conf = new HashMap();
28 | conf.put(CrawlerConstants.URL_FILTERS_FILE, "urlfilters.json");
29 | String filtersConfigFile = ConfUtils.getString(conf, CrawlerConstants.URL_FILTERS_FILE);
30 | URLFilters filters = UrlFilterUtils.load(conf, filtersConfigFile);
31 | URL sourceUrl;
32 | try {
33 | sourceUrl = new URL(testSourceUrl);
34 | } catch (MalformedURLException e) {
35 | // we would have known by now as previous components check whether the URL is valid
36 | LOG.error("MalformedURLException on {}", testSourceUrl);
37 | return;
38 | }
39 |         // a good URL passes the filters unchanged
40 |         assertNotNull(UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl, filters));
41 |         // a media URL is rejected, so no filter matches
42 |         assertNull(UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl.concat("song.mp3"), filters));
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpoutTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.crawler.spout;
2 |
3 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider;
4 | import org.junit.Test;
5 |
6 | import static org.junit.Assert.assertNotNull;
7 |
8 | public class UrlGeneratorSpoutTest {
9 |
10 |     // Smoke test: the spout must be constructible with the default service provider.
11 |     @Test
12 |     public void test() {
13 |         UrlGeneratorSpout spout = new UrlGeneratorSpout(new DefaultServiceProvider());
14 |         assertNotNull(spout);
15 |     }
16 | }
17 |
--------------------------------------------------------------------------------
/data-model/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <artifactId>crawling-framework</artifactId>
7 |         <groupId>lt.tokenmill.crawling</groupId>
8 |         <version>0.3.4-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>data-model</artifactId>
13 |
14 |     <dependencies>
15 |         <dependency>
16 |             <groupId>joda-time</groupId>
17 |             <artifactId>joda-time</artifactId>
18 |         </dependency>
19 |         <dependency>
20 |             <groupId>com.google.guava</groupId>
21 |             <artifactId>guava</artifactId>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>junit</groupId>
25 |             <artifactId>junit</artifactId>
26 |             <version>4.13.1</version>
27 |             <scope>test</scope>
28 |         </dependency>
29 |     </dependencies>
30 |
31 |     <profiles>
32 |         <profile>
33 |             <id>release</id>
34 |             <build>
35 |                 <plugins>
36 |                     <plugin>
37 |                         <groupId>org.apache.maven.plugins</groupId>
38 |                         <artifactId>maven-source-plugin</artifactId>
39 |                     </plugin>
40 |                     <plugin>
41 |                         <groupId>org.apache.maven.plugins</groupId>
42 |                         <artifactId>maven-jar-plugin</artifactId>
43 |                     </plugin>
44 |                     <plugin>
45 |                         <groupId>org.apache.maven.plugins</groupId>
46 |                         <artifactId>maven-javadoc-plugin</artifactId>
47 |                     </plugin>
48 |                 </plugins>
49 |             </build>
50 |         </profile>
51 |     </profiles>
52 | </project>
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/DataUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import com.google.common.base.Splitter;
4 | import org.joda.time.DateTime;
5 | import org.joda.time.DateTimeZone;
6 | import org.joda.time.format.DateTimeFormat;
7 | import org.joda.time.format.DateTimeFormatter;
8 |
9 | import java.io.Serializable;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 |
13 | public class DataUtils implements Serializable {
14 |
15 | private static final DateTimeFormatter FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss");
16 |
17 | public static Integer tryParseInteger(Object object) {
18 | try {
19 | return (object != null) ? Integer.parseInt(object.toString()) : null;
20 | } catch (NumberFormatException e) {
21 | }
22 | return null;
23 | }
24 |
25 | public static Long tryParseLong(Object object) {
26 | try {
27 | return (object != null) ? Long.parseLong(object.toString()) : null;
28 | } catch (NumberFormatException e) {
29 | }
30 | return null;
31 | }
32 |
33 |     public static List<String> parseStringList(Object object) {
34 | if (object == null) {
35 | return null;
36 | }
37 | return Splitter.onPattern("(?:\r?\n)+")
38 | .splitToList(object.toString())
39 | .stream()
40 | .map(String::trim)
41 | .filter(s -> !s.isEmpty())
42 | .collect(Collectors.toList());
43 | }
44 |
45 | public static String formatInUTC(DateTime date) {
46 | return date != null ? FORMATTER.print(date.toDateTime(DateTimeZone.UTC)) : null;
47 | }
48 |
49 | public static DateTime parseFromUTC(String date) {
50 | return date != null ? FORMATTER.parseDateTime(date) : null;
51 | }
52 | }
53 |
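
Sketch (not a repository file): a usage pass over the helpers above; expected values are noted in comments.

    import java.util.List;

    import org.joda.time.DateTime;
    import org.joda.time.DateTimeZone;

    import lt.tokenmill.crawling.data.DataUtils;

    public class DataUtilsSketch {

        public static void main(String[] args) {
            // Multi-line configuration values are split on newlines, trimmed, and blanks dropped.
            List<String> items = DataUtils.parseStringList("a\n\n  b  \nc");
            System.out.println(items); // [a, b, c]

            // UTC round-trip with the yyyy-MM-dd'T'HH:mm:ss pattern defined above.
            String formatted = DataUtils.formatInUTC(new DateTime(2020, 1, 2, 3, 4, 5, DateTimeZone.UTC));
            System.out.println(formatted); // 2020-01-02T03:04:05
            System.out.println(DataUtils.parseFromUTC(formatted) != null); // true
        }
    }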
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HighlightedSearchResult.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.List;
5 |
6 | public class HighlightedSearchResult implements Serializable {
7 |
8 | private HttpArticle article;
9 |
10 |     private List<String> highlights;
11 |
12 |     public HighlightedSearchResult(HttpArticle article, List<String> highlights) {
13 | this.article = article;
14 | this.highlights = highlights;
15 | }
16 |
17 | public HttpArticle getArticle() {
18 | return article;
19 | }
20 |
21 | public void setArticle(HttpArticle article) {
22 | this.article = article;
23 | }
24 |
25 |     public List<String> getHighlights() {
26 | return highlights;
27 | }
28 |
29 |     public void setHighlights(List<String> highlights) {
30 | this.highlights = highlights;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HtmlAnalysisResult.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | public class HtmlAnalysisResult implements Serializable {
8 |
9 | private String url;
10 | private String title;
11 | private Integer httpStatus;
12 |     private List<String> links;
13 |     private Map<String, String> headers;
14 |     private List<String> metaValues;
15 |
16 | private String robotsTxt;
17 | private Boolean robotsAllowedAll;
18 | private Boolean robotsAllowedNone;
19 | private Boolean robotsAllowedHome;
20 |     private List<String> robotsSitemaps;
21 | private Long robotsCrawlDelay;
22 |
23 | public String getUrl() {
24 | return url;
25 | }
26 |
27 | public void setUrl(String url) {
28 | this.url = url;
29 | }
30 |
31 | public String getTitle() {
32 | return title;
33 | }
34 |
35 | public void setTitle(String title) {
36 | this.title = title;
37 | }
38 |
39 |     public List<String> getLinks() {
40 |         return links;
41 |     }
42 |
43 |     public void setLinks(List<String> links) {
44 |         this.links = links;
45 |     }
46 |
47 |     public List<String> getMetaValues() {
48 |         return metaValues;
49 |     }
50 |
51 |     public void setMetaValues(List<String> metaValues) {
52 |         this.metaValues = metaValues;
53 |     }
54 |
55 | public String getRobotsTxt() {
56 | return robotsTxt;
57 | }
58 |
59 | public void setRobotsTxt(String robotsTxt) {
60 | this.robotsTxt = robotsTxt;
61 | }
62 |
63 | public Boolean getRobotsAllowedAll() {
64 | return robotsAllowedAll;
65 | }
66 |
67 | public void setRobotsAllowedAll(Boolean robotsAllowedAll) {
68 | this.robotsAllowedAll = robotsAllowedAll;
69 | }
70 |
71 | public Boolean getRobotsAllowedNone() {
72 | return robotsAllowedNone;
73 | }
74 |
75 | public void setRobotsAllowedNone(Boolean robotsAllowedNone) {
76 | this.robotsAllowedNone = robotsAllowedNone;
77 | }
78 |
79 | public Boolean getRobotsAllowedHome() {
80 | return robotsAllowedHome;
81 | }
82 |
83 | public void setRobotsAllowedHome(Boolean robotsAllowedHome) {
84 | this.robotsAllowedHome = robotsAllowedHome;
85 | }
86 |
87 |     public List<String> getRobotsSitemaps() {
88 |         return robotsSitemaps;
89 |     }
90 |
91 |     public void setRobotsSitemaps(List<String> robotsSitemaps) {
92 |         this.robotsSitemaps = robotsSitemaps;
93 |     }
94 |
95 | public Long getRobotsCrawlDelay() {
96 | return robotsCrawlDelay;
97 | }
98 |
99 | public void setRobotsCrawlDelay(Long robotsCrawlDelay) {
100 | this.robotsCrawlDelay = robotsCrawlDelay;
101 | }
102 |
103 | public Integer getHttpStatus() {
104 | return httpStatus;
105 | }
106 |
107 | public void setHttpStatus(Integer httpStatus) {
108 | this.httpStatus = httpStatus;
109 | }
110 |
111 |     public Map<String, String> getHeaders() {
112 |         return headers;
113 |     }
114 |
115 |     public void setHeaders(Map<String, String> headers) {
116 |         this.headers = headers;
117 |     }
118 | }
119 |
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticle.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 |
4 | import org.joda.time.DateTime;
5 |
6 | import java.io.Serializable;
7 | import java.util.List;
8 |
9 | public class HttpArticle implements Serializable {
10 |
11 | private String source;
12 |
13 | private String language;
14 |
15 | private String url;
16 |
17 | private String title;
18 |
19 | private String text;
20 |
21 | private String textSignature;
22 |
23 |     private List<String> appIds;
24 |
25 | private DateTime published;
26 |
27 | private DateTime discovered;
28 |
29 |     private List<String> categories;
30 |
31 | public String getSource() {
32 | return source;
33 | }
34 |
35 | public String getUrl() {
36 | return url;
37 | }
38 |
39 | public void setSource(String source) {
40 | this.source = source;
41 | }
42 |
43 | public DateTime getPublished() {
44 | return published;
45 | }
46 |
47 | public void setPublished(DateTime published) {
48 | this.published = published;
49 | }
50 |
51 | public DateTime getDiscovered() {
52 | return discovered;
53 | }
54 |
55 | public void setDiscovered(DateTime discovered) {
56 | this.discovered = discovered;
57 | }
58 |
59 | public void setUrl(String url) {
60 | this.url = url;
61 | }
62 |
63 | public String getTitle() {
64 | return title;
65 | }
66 |
67 | public void setTitle(String title) {
68 | this.title = title;
69 | }
70 |
71 | public String getText() {
72 | return text;
73 | }
74 |
75 | public void setText(String text) {
76 | this.text = text;
77 | }
78 |
79 | public List<String> getAppIds() {
80 | return appIds;
81 | }
82 |
83 | public void setAppIds(List<String> appIds) {
84 | this.appIds = appIds;
85 | }
86 |
87 | public List<String> getCategories() {
88 | return categories;
89 | }
90 |
91 | public void setCategories(List<String> categories) {
92 | this.categories = categories;
93 | }
94 |
95 | public String getLanguage() {
96 | return language;
97 | }
98 |
99 | public void setLanguage(String language) {
100 | this.language = language;
101 | }
102 |
103 | public String getTextSignature() {
104 | return textSignature;
105 | }
106 |
107 | public void setTextSignature(String textSignature) {
108 | this.textSignature = textSignature;
109 | }
110 |
111 | @Override
112 | public String toString() {
113 | return "HttpArticle{" +
114 | "source='" + source + '\'' +
115 | ", language='" + language + '\'' +
116 | ", url='" + url + '\'' +
117 | ", title='" + title + '\'' +
118 | ", text='" + text + '\'' +
119 | ", textSignature='" + textSignature + '\'' +
120 | ", appIds=" + appIds +
121 | ", published=" + published +
122 | ", discovered=" + discovered +
123 | ", categories=" + categories +
124 | '}';
125 | }
126 | }
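
A minimal usage sketch (illustrative, not a repository file; all values are hypothetical) showing how an HttpArticle is typically populated before it is handed to the Elasticsearch module:

import lt.tokenmill.crawling.data.HttpArticle;
import org.joda.time.DateTime;

public class HttpArticleExample {
    public static void main(String[] args) {
        HttpArticle article = new HttpArticle();
        article.setSource("www.example.com");             // site the URL belongs to
        article.setUrl("http://www.example.com/news/1");
        article.setTitle("Example title");
        article.setText("Example body text.");
        article.setPublished(DateTime.now());             // publication time of the article
        article.setDiscovered(DateTime.now());            // time the crawler found the URL
        System.out.println(article);                      // uses the toString() defined above
    }
}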
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticleParseResult.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.Collections;
5 | import java.util.List;
6 |
7 | public class HttpArticleParseResult implements Serializable {
8 |
9 | private HttpArticle article;
10 |
11 | private List<String> titleMatches;
12 |
13 | private List<String> textMatches;
14 |
15 | private List<String> publishedTexts;
16 |
17 | private List<String> publishedMatches;
18 |
19 | private String publishedPattern;
20 |
21 | public HttpArticleParseResult() {
22 | }
23 |
24 | public HttpArticleParseResult(HttpArticle article) {
25 | this.article = article;
26 | }
27 |
28 | public HttpArticle getArticle() {
29 | return article;
30 | }
31 |
32 | public void setArticle(HttpArticle article) {
33 | this.article = article;
34 | }
35 |
36 | public List<String> getTitleMatches() {
37 | return titleMatches != null ? titleMatches : Collections.emptyList();
38 | }
39 |
40 | public void setTitleMatches(List<String> titleMatches) {
41 | this.titleMatches = titleMatches;
42 | }
43 |
44 | public List<String> getTextMatches() {
45 | return textMatches != null ? textMatches : Collections.emptyList();
46 | }
47 |
48 | public void setTextMatches(List<String> textMatches) {
49 | this.textMatches = textMatches;
50 | }
51 |
52 | public List<String> getPublishedTexts() {
53 | return publishedTexts != null ? publishedTexts : Collections.emptyList();
54 | }
55 |
56 | public void setPublishedTexts(List<String> publishedTexts) {
57 | this.publishedTexts = publishedTexts;
58 | }
59 |
60 | public List<String> getPublishedMatches() {
61 | return publishedMatches != null ? publishedMatches : Collections.emptyList();
62 | }
63 |
64 | public void setPublishedMatches(List<String> publishedMatches) {
65 | this.publishedMatches = publishedMatches;
66 | }
67 |
68 | public String getPublishedPattern() {
69 | return publishedPattern;
70 | }
71 |
72 | public void setPublishedPattern(String publishedPattern) {
73 | this.publishedPattern = publishedPattern;
74 | }
75 | }
76 |
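
An illustrative sketch (hypothetical values) of how a parser can attach diagnostics next to the parsed article; note that the getters above never return null, so callers can iterate without checks:

import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpArticleParseResult;
import java.util.Arrays;

public class ParseResultExample {
    public static void main(String[] args) {
        HttpArticle article = new HttpArticle();
        article.setTitle("Example title");
        HttpArticleParseResult result = new HttpArticleParseResult(article);
        result.setTitleMatches(Arrays.asList("META:og:title"));  // selector that produced the title
        result.setPublishedPattern("yyyy-MM-dd'T'HH:mm:ss");     // date pattern that matched
        // Safe even though textMatches was never set:
        result.getTextMatches().forEach(System.out::println);
    }
}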
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HttpSourceTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 |
5 | public class HttpSourceTest implements Serializable {
6 |
7 | private String source;
8 |
9 | private String url;
10 |
11 | private Boolean urlAccepted;
12 |
13 | private String html;
14 |
15 | private String title;
16 |
17 | private String text;
18 |
19 | private String date;
20 |
21 | public String getSource() {
22 | return source;
23 | }
24 |
25 | public void setSource(String source) {
26 | this.source = source;
27 | }
28 |
29 | public String getUrl() {
30 | return url;
31 | }
32 |
33 | public void setUrl(String url) {
34 | this.url = url;
35 | }
36 |
37 | public Boolean getUrlAccepted() {
38 | return urlAccepted;
39 | }
40 |
41 | public void setUrlAccepted(Boolean urlAccepted) {
42 | this.urlAccepted = urlAccepted;
43 | }
44 |
45 | public String getHtml() {
46 | return html;
47 | }
48 |
49 | public void setHtml(String html) {
50 | this.html = html;
51 | }
52 |
53 | public String getTitle() {
54 | return title;
55 | }
56 |
57 | public void setTitle(String title) {
58 | this.title = title;
59 | }
60 |
61 | public String getText() {
62 | return text;
63 | }
64 |
65 | public void setText(String text) {
66 | this.text = text;
67 | }
68 |
69 | public String getDate() {
70 | return date;
71 | }
72 |
73 | public void setDate(String date) {
74 | this.date = date;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/HttpUrl.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import org.joda.time.DateTime;
4 |
5 | import java.io.Serializable;
6 |
7 | public class HttpUrl implements Serializable {
8 |
9 | /**
10 | * Source url.
11 | */
12 | private String source;
13 |
14 | private String url;
15 |
16 | /**
17 | * Publish date when it is known before parsing, e.g. from an RSS feed.
18 | */
19 | private String published;
20 |
21 | /**
22 | * When this URL was discovered.
23 | */
24 | private DateTime discovered;
25 |
26 | public String getSource() {
27 | return source;
28 | }
29 |
30 | public void setSource(String source) {
31 | this.source = source;
32 | }
33 |
34 | public String getUrl() {
35 | return url;
36 | }
37 |
38 | public void setUrl(String url) {
39 | this.url = url;
40 | }
41 |
42 | public String getPublished() {
43 | return published;
44 | }
45 |
46 | public void setPublished(String published) {
47 | this.published = published;
48 | }
49 |
50 | public DateTime getDiscovered() {
51 | return discovered;
52 | }
53 |
54 | public void setDiscovered(DateTime discovered) {
55 | this.discovered = discovered;
56 | }
57 | }
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/NamedQuery.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 |
5 | public class NamedQuery implements Serializable {
6 |
7 | private String name;
8 |
9 | private String stemmedCaseSensitive;
10 | private String stemmedCaseInSensitive;
11 | private String notStemmedCaseSensitive;
12 | private String notStemmedCaseInSensitive;
13 | private String advanced;
14 |
15 | public String getName() {
16 | return name;
17 | }
18 |
19 | public void setName(String name) {
20 | this.name = name;
21 | }
22 |
23 | public String getStemmedCaseSensitive() {
24 | return stemmedCaseSensitive;
25 | }
26 |
27 | public void setStemmedCaseSensitive(String stemmedCaseSensitive) {
28 | this.stemmedCaseSensitive = stemmedCaseSensitive;
29 | }
30 |
31 | public String getStemmedCaseInSensitive() {
32 | return stemmedCaseInSensitive;
33 | }
34 |
35 | public void setStemmedCaseInSensitive(String stemmedCaseInSensitive) {
36 | this.stemmedCaseInSensitive = stemmedCaseInSensitive;
37 | }
38 |
39 | public String getNotStemmedCaseSensitive() {
40 | return notStemmedCaseSensitive;
41 | }
42 |
43 | public void setNotStemmedCaseSensitive(String notStemmedCaseSensitive) {
44 | this.notStemmedCaseSensitive = notStemmedCaseSensitive;
45 | }
46 |
47 | public String getNotStemmedCaseInSensitive() {
48 | return notStemmedCaseInSensitive;
49 | }
50 |
51 | public void setNotStemmedCaseInSensitive(String notStemmedCaseInSensitive) {
52 | this.notStemmedCaseInSensitive = notStemmedCaseInSensitive;
53 | }
54 |
55 | public String getAdvanced() {
56 | return advanced;
57 | }
58 |
59 | public void setAdvanced(String advanced) {
60 | this.advanced = advanced;
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/data-model/src/main/java/lt/tokenmill/crawling/data/PageableList.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.List;
5 |
6 | public class PageableList<T> implements Serializable {
7 |
8 | private long totalCount;
9 |
10 | private List<T> items;
11 |
12 | public long getTotalCount() {
13 | return totalCount;
14 | }
15 |
16 | public void setTotalCount(long totalCount) {
17 | this.totalCount = totalCount;
18 | }
19 |
20 | public List<T> getItems() {
21 | return items;
22 | }
23 |
24 | public void setItems(List<T> items) {
25 | this.items = items;
26 | }
27 |
28 | public static <T> PageableList<T> create(List<T> items, long totalCount) {
29 | PageableList<T> pageableList = new PageableList<>();
30 | pageableList.setItems(items);
31 | pageableList.setTotalCount(totalCount);
32 | return pageableList;
33 | }
34 | }
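
With the generic signature restored, the factory can be used as below (illustrative sketch, not a repository file):

import lt.tokenmill.crawling.data.PageableList;
import java.util.Arrays;

public class PageableListExample {
    public static void main(String[] args) {
        // items is one page of results; totalCount is the size of the whole result set
        PageableList<String> page = PageableList.create(Arrays.asList("a", "b"), 42L);
        System.out.println(page.getItems().size() + " of " + page.getTotalCount());
    }
}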
--------------------------------------------------------------------------------
/data-model/src/test/java/lt/tokenmill/crawling/data/DataUtilsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.data;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.joda.time.DateTime;
5 | import org.junit.Test;
6 |
7 | import static org.junit.Assert.assertEquals;
8 |
9 | public class DataUtilsTest {
10 |
11 | @Test
12 | public void normalizerSplitter() {
13 | assertEquals(Lists.newArrayList("\\?.*$-->>", "a-->>b"),
14 | DataUtils.parseStringList("\\?.*$-->>\na-->>b\r\r\n\n"));
15 | }
16 |
17 | @Test
18 | public void dateFormatInUTC() {
19 | Long DATE_2017_01_04_12_26_00 = 1483532760805L;
20 | assertEquals("2017-01-04T12:26:00", DataUtils.formatInUTC(new DateTime(DATE_2017_01_04_12_26_00)));
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | elasticsearch:
5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
6 | ports: ["9200:9200"]
7 | environment:
8 | discovery.type: single-node
9 | kibana:
10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0
11 | ports: ["5601:5601"]
12 | environment:
13 | SERVER_NAME: kibana
14 | ELASTICSEARCH_URL: http://elasticsearch:9200
15 |
--------------------------------------------------------------------------------
/docker-compose.run.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | elasticsearch:
5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
6 | ports: ["9200:9200"]
7 | environment:
8 | discovery.type: single-node
9 | kibana:
10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0
11 | ports: ["5601:5601"]
12 | environment:
13 | SERVER_NAME: kibana
14 | ELASTICSEARCH_URL: http://elasticsearch:9200
15 | administration-ui:
16 | image: registry.gitlab.com/tokenmill/crawling-framework/ui:latest
17 | ports: ["8081:8081"]
18 | crawler:
19 | image: registry.gitlab.com/tokenmill/crawling-framework/crawler:latest
20 | environment:
21 | DEFAULT_SOURCE_RELOAD_DELAY: 10
22 |
--------------------------------------------------------------------------------
/elasticsearch/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 | <parent>
6 | <artifactId>crawling-framework</artifactId>
7 | <groupId>lt.tokenmill.crawling</groupId>
8 | <version>0.3.4-SNAPSHOT</version>
9 | </parent>
10 | <modelVersion>4.0.0</modelVersion>
11 |
12 | <artifactId>elasticsearch</artifactId>
13 |
14 | <dependencies>
15 |
16 | <dependency>
17 | <groupId>lt.tokenmill.crawling</groupId>
18 | <artifactId>data-model</artifactId>
19 | </dependency>
20 | <dependency>
21 | <groupId>org.elasticsearch</groupId>
22 | <artifactId>elasticsearch</artifactId>
23 | </dependency>
24 | <dependency>
25 | <groupId>org.elasticsearch.client</groupId>
26 | <artifactId>transport</artifactId>
27 | </dependency>
28 | <dependency>
29 | <groupId>org.elasticsearch.client</groupId>
30 | <artifactId>elasticsearch-rest-client</artifactId>
31 | <version>${elasticsearch.version}</version>
32 | </dependency>
33 | <dependency>
34 | <groupId>org.elasticsearch.client</groupId>
35 | <artifactId>elasticsearch-rest-high-level-client</artifactId>
36 | <version>${elasticsearch.version}</version>
37 | </dependency>
38 | <dependency>
39 | <groupId>org.apache.httpcomponents</groupId>
40 | <artifactId>httpasyncclient</artifactId>
41 | <version>4.1.3</version>
42 | </dependency>
43 | <dependency>
44 | <groupId>org.apache.httpcomponents</groupId>
45 | <artifactId>httpcore-nio</artifactId>
46 | <version>4.4.6</version>
47 | </dependency>
48 | <dependency>
49 | <groupId>org.apache.httpcomponents</groupId>
50 | <artifactId>httpclient</artifactId>
51 | <version>4.5.4</version>
52 | </dependency>
53 | <dependency>
54 | <groupId>org.apache.httpcomponents</groupId>
55 | <artifactId>httpcore</artifactId>
56 | <version>4.4.6</version>
57 | </dependency>
58 | <dependency>
59 | <groupId>org.elasticsearch.plugin</groupId>
60 | <artifactId>transport-netty4-client</artifactId>
61 | <version>${elasticsearch.version}</version>
62 | <scope>test</scope>
63 | </dependency>
64 | <dependency>
65 | <groupId>com.google.guava</groupId>
66 | <artifactId>guava</artifactId>
67 | </dependency>
68 | <dependency>
69 | <groupId>org.apache.logging.log4j</groupId>
70 | <artifactId>log4j-api</artifactId>
71 | <version>2.7</version>
72 | <scope>provided</scope>
73 | </dependency>
74 | <dependency>
75 | <groupId>org.apache.logging.log4j</groupId>
76 | <artifactId>log4j-core</artifactId>
77 | <version>2.13.2</version>
78 | <scope>provided</scope>
79 | </dependency>
80 | <dependency>
81 | <groupId>org.slf4j</groupId>
82 | <artifactId>slf4j-log4j12</artifactId>
83 | <version>${slf4j.version}</version>
84 | <scope>provided</scope>
85 | </dependency>
86 | <dependency>
87 | <groupId>junit</groupId>
88 | <artifactId>junit</artifactId>
89 | <version>4.13.1</version>
90 | <scope>test</scope>
91 | </dependency>
92 | </dependencies>
93 |
94 | <profiles>
95 | <profile>
96 | <id>release</id>
97 | <build>
98 | <plugins>
99 | <plugin>
100 | <groupId>org.apache.maven.plugins</groupId>
101 | <artifactId>maven-source-plugin</artifactId>
102 | </plugin>
103 |
104 | <plugin>
105 | <groupId>org.apache.maven.plugins</groupId>
106 | <artifactId>maven-jar-plugin</artifactId>
107 | </plugin>
108 |
109 | <plugin>
110 | <groupId>org.apache.maven.plugins</groupId>
111 | <artifactId>maven-javadoc-plugin</artifactId>
112 | </plugin>
113 | </plugins>
114 | </build>
115 | </profile>
116 | </profiles>
117 |
118 | </project>
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import org.elasticsearch.client.RequestOptions;
4 | import org.slf4j.Logger;
5 | import org.slf4j.LoggerFactory;
6 |
7 | import java.net.URLEncoder;
8 | import java.util.UUID;
9 |
10 | public class BaseElasticOps {
11 |
12 | protected final Logger LOG = LoggerFactory.getLogger(this.getClass());
13 |
14 | private final RequestOptions requestOptions;
15 | private ElasticConnection connection;
16 | private String index;
17 | private String type;
18 |
19 | protected BaseElasticOps(ElasticConnection connection, String index, String type) {
20 | this.connection = connection;
21 | this.index = index;
22 | this.type = type;
23 | requestOptions = RequestOptions.DEFAULT;
24 | }
25 |
26 | protected ElasticConnection getConnection() {
27 | return connection;
28 | }
29 |
30 | protected String getIndex() {
31 | return index;
32 | }
33 |
34 | protected String getType() {
35 | return type;
36 | }
37 |
38 | protected RequestOptions getRequestOptions() { return requestOptions; }
39 |
40 | public void close() {
41 | if (connection != null) {
42 | connection.close();
43 | }
44 | }
45 |
46 | protected static String formatId(String url) {
47 | try {
48 | String urlId = URLEncoder.encode(url.toLowerCase(), "utf-8");
49 | if (urlId.length() > 511) { // Elasticsearch document ids are capped at 512 bytes
50 | urlId = urlId.substring(0, 511);
51 | }
52 | return urlId;
53 | } catch (Exception e) {
54 | e.printStackTrace();
55 | }
56 | return UUID.randomUUID().toString();
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/ElasticConstants.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | public class ElasticConstants {
4 |
5 | public static final String ES_HOSTNAME_PARAM = "es.hostname";
6 | public static final String ES_REST_PORT = "es.rest.port";
7 | public static final String ES_REST_SCHEME = "es.rest.scheme";
8 |
9 | public static final String ES_URLS_INDEX_NAME_PARAM = "es.urls.index.name";
10 | public static final String ES_URLS_DOC_TYPE_PARAM = "es.urls.doc.type";
11 |
12 | public static final String ES_DOCS_INDEX_NAME_PARAM = "es.docs.index.name";
13 | public static final String ES_DOCS_DOC_TYPE_PARAM = "es.docs.doc.type";
14 |
15 | public static final String ES_HTTP_SOURCES_INDEX_NAME_PARAM = "es.httpsource.index.name";
16 | public static final String ES_HTTP_SOURCES_DOC_TYPE_PARAM = "es.httpsource.doc.type";
17 |
18 | public static final String ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM = "es.httpsourcetest.index.name";
19 | public static final String ES_HTTP_SOURCES_TEST_TYPE_PARAM = "es.httpsourcetest.doc.type";
20 |
21 | public static final String ES_NAMED_QUERIES_INDEX_PARAM = "es.namedqueries.index.name";
22 | public static final String ES_NAMED_QUERIES_TYPE_PARAM = "es.namedqueries.doc.type";
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDataParser.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.joda.time.DateTime;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import java.text.ParseException;
9 | import java.text.SimpleDateFormat;
10 | import java.util.List;
11 | import java.util.TimeZone;
12 |
13 | class EsDataParser {
14 |
15 | private static final Logger LOG = LoggerFactory.getLogger(EsDataParser.class);
16 |
17 |
18 | private static final List<String> ES_DATE_TIME_FORMATS = Lists.newArrayList(
19 | "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'",
20 | "yyyy-MM-dd'T'HH:mm:ss'Z'"
21 | );
22 |
23 | static DateTime nullOrDate(Object object) {
24 | if (object != null) {
25 | DateTime result = null;
26 | for (String format : ES_DATE_TIME_FORMATS) {
27 | SimpleDateFormat formatter = new SimpleDateFormat(format);
28 | formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
29 | try {
30 | result = new DateTime(formatter.parse(object.toString()));
31 | break;
32 | } catch (ParseException ignored) {
33 | }
34 | }
35 | if (result == null) {
36 | LOG.error("Failed to parse date from '{}'", object);
37 | }
38 | return result;
39 | }
40 | return null;
41 | }
42 |
43 | static boolean falseOrBoolean(Object object) {
44 | return (object != null) && Boolean.parseBoolean(object.toString());
45 | }
46 | }
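
EsDataParser is package-private, so callers live in lt.tokenmill.crawling.es; a behavioural sketch (the example class itself is hypothetical):

package lt.tokenmill.crawling.es;

import org.joda.time.DateTime;

class EsDataParserExample {
    static void demo() {
        DateTime withMillis = EsDataParser.nullOrDate("2017-01-04T12:26:00.805Z"); // first format matches
        DateTime withoutMillis = EsDataParser.nullOrDate("2017-01-04T12:26:00Z");  // second format matches
        DateTime none = EsDataParser.nullOrDate(null);                             // null in, null out
        boolean flag = EsDataParser.falseOrBoolean("yes");                         // false: only "true" parses to true
    }
}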
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourcesCache.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import com.google.common.cache.CacheBuilder;
4 | import com.google.common.cache.CacheLoader;
5 | import com.google.common.cache.LoadingCache;
6 | import lt.tokenmill.crawling.data.HttpSource;
7 |
8 | import java.util.concurrent.ExecutionException;
9 | import java.util.concurrent.TimeUnit;
10 |
11 | public class EsHttpSourcesCache {
12 |
13 |
14 | private static LoadingCache<String, HttpSource> INSTANCE;
15 |
16 | private static synchronized LoadingCache<String, HttpSource> getInstance(
17 | final EsHttpSourceOperations operations) {
18 | if (INSTANCE == null) {
19 | INSTANCE = CacheBuilder.newBuilder()
20 | .maximumSize(1000)
21 | .expireAfterWrite(10, TimeUnit.MINUTES)
22 | .build(new CacheLoader<String, HttpSource>() {
23 | public HttpSource load(String url) {
24 | return operations.get(url);
25 | }
26 | });
27 | }
28 | return INSTANCE;
29 | }
30 |
31 | public static HttpSource get(EsHttpSourceOperations operations, String source) {
32 | try {
33 | return getInstance(operations).get(source);
34 | } catch (ExecutionException e) {
35 | throw new RuntimeException(e);
36 | }
37 | }
38 |
39 | public static void invalidate() {
40 | if (INSTANCE != null) {
41 | INSTANCE.invalidateAll();
42 | }
43 | }
44 |
45 | }
46 |
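
An illustrative sketch of the cache in use; the operations object and the index/type names are borrowed from the tests in this module, and the example class is hypothetical:

package lt.tokenmill.crawling.es;

import lt.tokenmill.crawling.data.HttpSource;

class EsHttpSourcesCacheExample {
    static HttpSource lookup(String sourceUrl) {
        ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http");
        EsHttpSourceOperations operations = new EsHttpSourceOperations(connection, "demo-http_sources", "http_source");
        // The first call loads from Elasticsearch; later calls hit the cache for up to 10 minutes.
        return EsHttpSourcesCache.get(operations, sourceUrl);
    }
}

EsHttpSourcesCache.invalidate() drops the cached entries, e.g. after sources are edited in the administration UI.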
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/Utils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import com.google.common.base.Joiner;
4 | import com.google.common.base.Splitter;
5 | import org.joda.time.DateTime;
6 |
7 | import java.util.Collection;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Objects;
11 | import java.util.stream.Collectors;
12 |
13 | import static com.google.common.base.Strings.isNullOrEmpty;
14 |
15 | public class Utils {
16 |
17 | private static final Splitter LINE_SPLITTER = Splitter.on('\n');
18 | private static final Joiner LINE_JOINER = Joiner.on('\n');
19 |
20 | public static List<String> linesToList(String text) {
21 | return LINE_SPLITTER.splitToList(text).stream()
22 | .map(String::trim)
23 | .filter(l -> !isNullOrEmpty(l))
24 | .collect(Collectors.toList());
25 | }
26 |
27 | public static String listToText(List<String> lines) {
28 | return lines != null ? LINE_JOINER.join(lines) : "";
29 | }
30 |
31 | public static Object formatFieldValue(Object value) {
32 | if (value == null) {
33 | return null;
34 | }
35 | if (value instanceof List) {
36 | List<?> v = (List<?>) value;
37 | if (!v.isEmpty() && (v.get(0) instanceof Map)) {
38 | return v;
39 | }
40 | return listToText((List<String>) value);
41 | } else if (value instanceof DateTime) {
42 | return ((DateTime) value).toDate();
43 | } else if (value instanceof Enum) {
44 | return Objects.toString(value, null);
45 | } else {
46 | return value;
47 | }
48 | }
49 | }
50 |
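
A behavioural sketch of the two line helpers (illustrative, not a repository file):

import lt.tokenmill.crawling.es.Utils;
import java.util.List;

public class UtilsExample {
    public static void main(String[] args) {
        // Blank lines are dropped and surrounding whitespace is trimmed:
        List<String> lines = Utils.linesToList(" first \n\nsecond\n"); // ["first", "second"]
        System.out.println(Utils.listToText(lines));                   // "first\nsecond"
        System.out.println(Utils.listToText(null).isEmpty());          // true
    }
}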
--------------------------------------------------------------------------------
/elasticsearch/src/main/java/lt/tokenmill/crawling/es/model/DateHistogramValue.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es.model;
2 |
3 | public class DateHistogramValue {
4 |
5 | private Long value;
6 |
7 | private String date;
8 |
9 | public DateHistogramValue(String date, Long value) {
10 | this.value = value;
11 | this.date = date;
12 | }
13 |
14 | public Long getValue() {
15 | return value;
16 | }
17 |
18 | public void setValue(Long value) {
19 | this.value = value;
20 | }
21 |
22 | public String getDate() {
23 | return date;
24 | }
25 |
26 | public void setDate(String date) {
27 | this.date = date;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/elasticsearch/src/main/resources/indices/document.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "number_of_shards": 1,
4 | "number_of_replicas": 0,
5 | "index": {
6 | "codec": "best_compression"
7 | },
8 | "analysis": {
9 | "filter": {
10 | "english_stop": {
11 | "type": "stop",
12 | "stopwords": "_english_"
13 | },
14 | "english_light_stemmer": {
15 | "type": "stemmer",
16 | "language": "light_english"
17 | },
18 | "english_possessive_stemmer": {
19 | "type": "stemmer",
20 | "language": "possessive_english"
21 | }
22 | },
23 | "analyzer": {
24 | "english_stem_cs": {
25 | "tokenizer": "standard",
26 | "filter": [
27 | "english_possessive_stemmer",
28 | "english_stop",
29 | "english_light_stemmer"
30 | ]
31 | },
32 | "english_stem_ci": {
33 | "tokenizer": "standard",
34 | "filter": [
35 | "english_possessive_stemmer",
36 | "lowercase",
37 | "english_stop",
38 | "english_light_stemmer"
39 | ]
40 | },
41 | "english_nostem_cs": {
42 | "tokenizer": "standard",
43 | "filter": [
44 | "english_possessive_stemmer",
45 | "english_stop"
46 | ]
47 | },
48 | "english_nostem_ci": {
49 | "tokenizer": "standard",
50 | "filter": [
51 | "english_possessive_stemmer",
52 | "lowercase",
53 | "english_stop"
54 | ]
55 | }
56 | }
57 | }
58 | },
59 | "mappings": {
60 | "doc": {
61 | "_source": {
62 | "enabled": true
63 | },
64 | "properties": {
65 | "created": {
66 | "type": "date"
67 | },
68 | "published": {
69 | "type": "date"
70 | },
71 | "discovered": {
72 | "type": "date"
73 | },
74 | "updated": {
75 | "type": "date"
76 | },
77 | "url": {
78 | "type": "keyword"
79 | },
80 | "source": {
81 | "type": "keyword"
82 | },
83 | "language": {
84 | "type": "keyword"
85 | },
86 | "status": {
87 | "type": "keyword"
88 | },
89 | "app_ids": {
90 | "type": "keyword"
91 | },
92 | "categories": {
93 | "type": "keyword"
94 | },
95 | "title": {
96 | "type": "text",
97 | "index": true,
98 | "doc_values": false,
99 | "fielddata": true,
100 | "fields": {
101 | "stem_cs": {
102 | "type": "text",
103 | "index": true,
104 | "analyzer": "english_stem_cs"
105 | },
106 | "stem_ci": {
107 | "type": "text",
108 | "index": true,
109 | "analyzer": "english_stem_ci"
110 | },
111 | "nostem_cs": {
112 | "type": "text",
113 | "index": true,
114 | "analyzer": "english_nostem_cs"
115 | },
116 | "nostem_ci": {
117 | "type": "text",
118 | "index": true,
119 | "analyzer": "english_nostem_ci"
120 | }
121 | }
122 | },
123 | "text": {
124 | "type": "text",
125 | "doc_values": false,
126 | "fielddata": true,
127 | "fields": {
128 | "stem_cs": {
129 | "type": "text",
130 | "index": true,
131 | "analyzer": "english_stem_cs"
132 | },
133 | "stem_ci": {
134 | "type": "text",
135 | "index": true,
136 | "analyzer": "english_stem_ci"
137 | },
138 | "nostem_cs": {
139 | "type": "text",
140 | "index": true,
141 | "analyzer": "english_nostem_cs"
142 | },
143 | "nostem_ci": {
144 | "type": "text",
145 | "index": true,
146 | "analyzer": "english_nostem_ci"
147 | }
148 | }
149 | },
150 | "text_signature": {
151 | "type": "keyword"
152 | },
153 | "duplicate_of": {
154 | "type": "keyword"
155 | }
156 | }
157 | }
158 | }
159 | }
160 |
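
The subfield analyzers above are what searches are expected to target. As an illustration only (using the high-level REST client types this module depends on; the index name is borrowed from the tests), a stemmed, case-insensitive query could be built like this:

import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class DocumentQueryExample {
    static SearchRequest stemmedQuery(String text) {
        // Matches against the "stem_ci" subfields defined in the mapping above.
        return new SearchRequest("demo-docs")
                .source(new SearchSourceBuilder()
                        .query(QueryBuilders.multiMatchQuery(text, "title.stem_ci", "text.stem_ci")));
    }
}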
--------------------------------------------------------------------------------
/elasticsearch/src/main/resources/indices/http_source.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "number_of_shards": 1,
4 | "number_of_replicas": 0,
5 | "index": {
6 | "codec": "best_compression"
7 | }
8 | },
9 | "mappings": {
10 | "http_source": {
11 | "_source": {
12 | "enabled": true
13 | },
14 | "properties": {
15 | "created": {
16 | "type": "date",
17 | "format": "date_optional_time"
18 | },
19 | "updated": {
20 | "type": "date",
21 | "format": "date_optional_time"
22 | },
23 | "url": {
24 | "type": "keyword",
25 | "copy_to": "search_field"
26 | },
27 | "name": {
28 | "type": "keyword",
29 | "copy_to": "search_field"
30 | },
31 | "timezone": {
32 | "type": "keyword"
33 | },
34 | "language": {
35 | "type": "keyword"
36 | },
37 | "url_crawl_delay_secs": {
38 | "type": "integer"
39 | },
40 | "feed_crawl_delay_secs": {
41 | "type": "integer"
42 | },
43 | "sitemap_crawl_delay_secs": {
44 | "type": "integer"
45 | },
46 | "enabled": {
47 | "type": "boolean"
48 | },
49 | "discovery_enabled": {
50 | "type": "boolean"
51 | },
52 | "urls": {
53 | "type": "keyword",
54 | "copy_to": "search_field"
55 | },
56 | "sitemaps": {
57 | "type": "keyword",
58 | "copy_to": "search_field"
59 | },
60 | "feeds": {
61 | "type": "keyword",
62 | "copy_to": "search_field"
63 | },
64 | "countries": {
65 | "type": "keyword"
66 | },
67 | "categories": {
68 | "type": "keyword"
69 | },
70 | "app_ids": {
71 | "type": "keyword"
72 | },
73 | "url_filters": {
74 | "type": "keyword"
75 | },
76 | "url_normalizers": {
77 | "type": "keyword"
78 | },
79 | "title_selectors": {
80 | "type": "keyword"
81 | },
82 | "date_selectors": {
83 | "type": "keyword"
84 | },
85 | "text_selectors": {
86 | "type": "keyword"
87 | },
88 | "text_normalizers": {
89 | "type": "keyword"
90 | },
91 | "date_regexps": {
92 | "type": "keyword"
93 | },
94 | "date_formats": {
95 | "type": "keyword"
96 | },
97 | "search_field": {
98 | "type": "text"
99 | }
100 | }
101 | }
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/elasticsearch/src/main/resources/indices/http_source_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "number_of_shards": 1,
4 | "number_of_replicas": 0,
5 | "index": {
6 | "codec": "best_compression"
7 | }
8 | },
9 | "mappings": {
10 | "http_source_test": {
11 | "_source": {
12 | "enabled": true
13 | },
14 | "properties": {
15 | "updated": {
16 | "type": "date",
17 | "format": "date_optional_time"
18 | },
19 | "source_url": {
20 | "type": "keyword",
21 | "copy_to": "search_field"
22 | },
23 | "url": {
24 | "type": "keyword",
25 | "copy_to": "search_field"
26 | },
27 | "url_accepted": {
28 | "type": "boolean",
29 | "doc_values": false
30 | },
31 | "html": {
32 | "type": "keyword",
33 | "index": false,
34 | "doc_values": false
35 | },
36 | "title": {
37 | "type": "keyword",
38 | "index": false,
39 | "doc_values": false
40 | },
41 | "text": {
42 | "type": "keyword",
43 | "index": false,
44 | "doc_values": false
45 | },
46 | "date": {
47 | "type": "keyword",
48 | "index": false,
49 | "doc_values": false
50 | },
51 | "search_field": {
52 | "type": "text"
53 | }
54 | }
55 | }
56 | }
57 | }
--------------------------------------------------------------------------------
/elasticsearch/src/main/resources/indices/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "number_of_shards": 1,
4 | "number_of_replicas": 0,
5 | "index": {
6 | "codec": "best_compression"
7 | }
8 | },
9 | "mappings": {
10 | "named_query": {
11 | "_source": {
12 | "enabled": true
13 | },
14 | "properties": {
15 | "updated": {
16 | "type": "date",
17 | "format": "date_optional_time"
18 | },
19 | "name": {
20 | "type": "keyword"
21 | },
22 | "name_suggest": {
23 | "type": "completion"
24 | },
25 | "stemmed_case_sensitive": {
26 | "type": "keyword"
27 | },
28 | "stemmed_case_insensitive": {
29 | "type": "keyword"
30 | },
31 | "not_stemmed_case_sensitive": {
32 | "type": "keyword"
33 | },
34 | "not_stemmed_case_insensitive": {
35 | "type": "keyword"
36 | },
37 | "advanced": {
38 | "type": "keyword"
39 | }
40 | }
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/elasticsearch/src/main/resources/indices/url.json:
--------------------------------------------------------------------------------
1 | {
2 | "settings": {
3 | "number_of_shards": 1,
4 | "number_of_replicas": 0,
5 | "index": {
6 | "codec": "best_compression"
7 | }
8 | },
9 | "mappings": {
10 | "url": {
11 | "_source": {
12 | "enabled": true
13 | },
14 | "properties": {
15 | "created": {
16 | "type": "date"
17 | },
18 | "updated": {
19 | "type": "date"
20 | },
21 | "published": {
22 | "type": "date"
23 | },
24 | "url": {
25 | "type": "keyword"
26 | },
27 | "source": {
28 | "type": "keyword"
29 | },
30 | "status": {
31 | "type": "keyword"
32 | }
33 | }
34 | }
35 | }
36 | }
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticConnectionTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import org.apache.http.HttpHost;
4 | import org.elasticsearch.action.DocWriteRequest;
5 | import org.elasticsearch.action.bulk.BulkItemResponse;
6 | import org.elasticsearch.action.bulk.BulkProcessor;
7 | import org.elasticsearch.action.bulk.BulkRequest;
8 | import org.elasticsearch.action.bulk.BulkResponse;
9 | import org.elasticsearch.action.index.IndexRequest;
10 | import org.elasticsearch.action.update.UpdateRequest;
11 | import org.elasticsearch.client.RestClient;
12 | import org.elasticsearch.client.RestClientBuilder;
13 | import org.elasticsearch.client.RestHighLevelClient;
14 | import org.elasticsearch.common.unit.TimeValue;
15 | import org.junit.Test;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | import java.io.UnsupportedEncodingException;
20 | import java.net.URLDecoder;
21 |
22 | import static org.junit.Assert.assertNotNull;
23 |
24 | public class ElasticConnectionTest {
25 | private static final Logger LOG = LoggerFactory.getLogger(ElasticConnectionTest.class);
26 | @Test
27 | public void testConnectionBuilder() {
28 | ElasticConnection connection = ElasticConnection.builder().build();
29 | assertNotNull(connection.getRestHighLevelClient());
30 | }
31 |
32 | @Test
33 | public void testBuilder() {
34 | BulkProcessor.Listener listener = new BulkProcessor.Listener() {
35 |
36 | @Override
37 | public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
38 | for (BulkItemResponse item : response.getItems()) {
39 | if (item.isFailed()) {
40 | LOG.error("Bulk item failure: '{}' for request '{}'",
41 | item.getFailure(), request.requests().get(item.getItemId()));
42 | }
43 | }
44 | }
45 |
46 | @Override
47 | public void afterBulk(long executionId, BulkRequest request, Throwable response) {
48 | LOG.error("Bulk failed", response);
49 | }
50 |
51 | @Override
52 | public void beforeBulk(long executionId, BulkRequest request) {
53 | for (DocWriteRequest r : request.requests()) {
54 | try {
55 | if (r instanceof IndexRequest) {
56 | IndexRequest indexRequest = (IndexRequest) r;
57 | indexRequest.id(URLDecoder.decode(indexRequest.id(), "utf-8"));
58 |
59 | } else if (r instanceof UpdateRequest) {
60 | UpdateRequest updateRequest = (UpdateRequest) r;
61 | updateRequest.id(URLDecoder.decode(updateRequest.id(), "utf-8"));
62 | }
63 | } catch (UnsupportedEncodingException e) {
64 | e.printStackTrace();
65 | }
66 | }
67 | }
68 | };
69 | ElasticConnection connection = ElasticConnection.builder()
70 | .hostname("0.0.0.0")
71 | .restPort(443)
72 | .restScheme("https")
73 | .bulkActions(1)
74 | .flushIntervalString("1s")
75 | .listener(listener)
76 | .build();
77 | assertNotNull(connection);
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticsearchTestServer.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import org.elasticsearch.client.Client;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.node.Node;
7 | import org.elasticsearch.node.NodeValidationException;
8 | import org.elasticsearch.plugins.Plugin;
9 | import org.elasticsearch.transport.Netty4Plugin;
10 |
11 | import java.io.File;
12 | import java.io.IOException;
13 | import java.nio.file.FileVisitOption;
14 | import java.nio.file.Files;
15 | import java.nio.file.Path;
16 | import java.nio.file.Paths;
17 | import java.util.Arrays;
18 | import java.util.Collection;
19 | import java.util.Comparator;
20 |
21 | public class ElasticsearchTestServer {
22 |
23 | private static class MyNode extends Node {
24 | MyNode(Settings preparedSettings, Collection<Class<? extends Plugin>> classpathPlugins) {
25 | super(new Environment(preparedSettings, null), classpathPlugins, false);
26 | }
27 | }
28 |
29 | private final Node node;
30 | private Client client;
31 |
32 | private ElasticsearchTestServer(Builder builder) {
33 | if (builder.cleanDataDir) {
34 | try {
35 | Path rootPath = Paths.get(builder.dataDirectory);
36 | if (Files.exists(rootPath)) {
37 | Files.walk(rootPath, FileVisitOption.FOLLOW_LINKS)
38 | .sorted(Comparator.reverseOrder())
39 | .map(Path::toFile)
40 | .forEach(File::delete);
41 | }
42 | } catch (IOException e) {
43 | e.printStackTrace();
44 | }
45 | }
46 | Settings settings = Settings.builder()
47 | .put("client.transport.ignore_cluster_name", true)
48 | .put("transport.type", "netty4")
49 | .put("http.type", "netty4")
50 | .put("http.enabled", "true")
51 | .put("http.port", builder.httpPort)
52 | .put("path.home", builder.dataDirectory)
53 | .put("transport.tcp.port", builder.transportPort)
54 | .build();
55 | this.node = new MyNode(settings, Arrays.asList(Netty4Plugin.class));
56 | }
57 |
58 | public void start() {
59 | try {
60 | this.node.start();
61 | this.client = this.node.client();
62 | } catch (NodeValidationException e) {
63 | e.printStackTrace();
64 | }
65 | }
66 |
67 | public void stop() {
68 | try {
69 | this.client.close();
70 | this.node.close();
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | }
74 | }
75 |
76 | public static Builder builder() {
77 | return new Builder();
78 | }
79 |
80 |
81 | public static class Builder {
82 |
83 | private boolean cleanDataDir = true;
84 | private String dataDirectory = "target/elasticsearch-data";
85 | private int httpPort = 9200;
86 | private int transportPort = 9305;
87 |
88 | public Builder httpPort(int httpPort) {
89 | this.httpPort = httpPort;
90 | return this;
91 | }
92 |
93 | public Builder transportPort(int transportPort) {
94 | this.transportPort = transportPort;
95 | return this;
96 | }
97 |
98 | public ElasticsearchTestServer build() {
99 | return new ElasticsearchTestServer(this);
100 | }
101 |
102 |
103 | public Builder dataDirectory(String dataDirectory) {
104 | this.dataDirectory = dataDirectory;
105 | return this;
106 | }
107 |
108 | public Builder cleanDataDir(boolean cleanDataDir) {
109 | this.cleanDataDir = cleanDataDir;
110 | return this;
111 | }
112 | }
113 |
114 | }
115 |
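
A usage sketch for tests (illustrative; ports and paths shown are the builder defaults above):

public class ElasticsearchTestServerExample {
    public static void main(String[] args) {
        ElasticsearchTestServer server = ElasticsearchTestServer.builder()
                .httpPort(9200)
                .transportPort(9305)
                .dataDirectory("target/elasticsearch-data")
                .cleanDataDir(true)            // wipe the data directory before starting
                .build();
        server.start();
        // ... run assertions against http://localhost:9200 ...
        server.stop();
    }
}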
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsDocumentOperationsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import com.google.common.collect.ImmutableMap;
4 | import lt.tokenmill.crawling.data.HttpArticle;
5 | import org.joda.time.DateTime;
6 | import org.junit.Ignore;
7 | import org.junit.Test;
8 |
9 | import java.util.Arrays;
10 | import java.util.Map;
11 |
12 | import static org.junit.Assert.assertEquals;
13 | import static org.junit.Assert.assertNull;
14 |
15 | public class EsDocumentOperationsTest {
16 |
17 | @Test
18 | @Ignore
19 | public void test() throws InterruptedException {
20 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http");
21 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "demo-docs", "doc");
22 | HttpArticle article = new HttpArticle();
23 | article.setUrl("http://www.bbc.com/news/science-environment-43727547");
24 | article.setTitle("title");
25 | article.setText("text");
26 | article.setPublished(DateTime.now());
27 |
28 | esDocumentOperations.store(article);
29 |
30 | Thread.sleep(6000);
31 |
32 | HttpArticle httpArticle = esDocumentOperations.get(article.getUrl());
33 | assertEquals(article.getUrl(), httpArticle.getUrl());
34 | assertEquals(article.getText(), httpArticle.getText());
35 |
36 | esDocumentOperations.update(article, ImmutableMap.of("TESTKEY", Arrays.asList(ImmutableMap.of("k1", "v1"))));
37 | Thread.sleep(6000);
38 | Map<String, Object> articleMap = esDocumentOperations.getAsMap(article.getUrl());
39 | assertEquals(article.getText(), articleMap.get("text"));
40 | assertEquals(Arrays.asList(ImmutableMap.of("k1", "v1")), articleMap.get("TESTKEY"));
41 | }
42 |
43 | @Test
44 | @Ignore
45 | public void testDuplicateFinder() throws InterruptedException {
46 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http");
47 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "cf-docs", "doc");
48 | HttpArticle article = new HttpArticle();
49 | article.setUrl("url1");
50 | article.setSource("source");
51 | article.setTitle("title");
52 | article.setText("text");
53 | article.setTextSignature("text_signature");
54 | article.setPublished(DateTime.now());
55 | esDocumentOperations.store(article);
56 | Thread.sleep(6000);
57 | HttpArticle duplicate = esDocumentOperations.findDuplicate(article);
58 | assertNull(duplicate);
59 | article.setUrl("url2");
60 | esDocumentOperations.store(article);
61 | Thread.sleep(6000);
62 | assertEquals("url1", esDocumentOperations.getAsMap("url2").get("duplicate_of"));
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpSourceOperationsTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import lt.tokenmill.crawling.data.HttpSource;
4 | import lt.tokenmill.crawling.data.PageableList;
5 | import org.junit.Ignore;
6 | import org.junit.Test;
7 |
8 | import static org.junit.Assert.*;
9 |
10 | public class EsHttpSourceOperationsTest {
11 |
12 | @Test
13 | @Ignore
14 | public void test() {
15 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http");
16 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "demo-http_sources", "http_source");
17 | PageableList<HttpSource> data = esHttpSourceOperations.filter(null);
18 | for (HttpSource source : data.getItems()) {
19 | System.out.println(">>" + source);
20 | }
21 | }
22 |
23 | @Ignore
24 | @Test
25 | public void testRefresh() {
26 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http");
27 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "cf-http_sources", "http_source");
28 | HttpSource source = new HttpSource();
29 | source.setName("test");
30 | source.setUrl("url");
31 | esHttpSourceOperations.save(source);
32 | String currentName = esHttpSourceOperations.get("url").getName();
33 | assertEquals("test", currentName);
34 | source.setName("new name");
35 | esHttpSourceOperations.save(source);
36 | String name = esHttpSourceOperations.get("url").getName();
37 | assertNotEquals("test", name);
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpUrlOperationsTestInt.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import lt.tokenmill.crawling.data.HttpUrl;
4 | import org.junit.Test;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import java.io.IOException;
9 | import java.util.List;
10 |
11 | import static junit.framework.TestCase.assertTrue;
12 |
13 | public class EsHttpUrlOperationsTestInt {
14 |
15 | private static final Logger LOG = LoggerFactory.getLogger(EsHttpUrlOperationsTestInt.class);
16 |
17 | private static final String ES_TEST_HOST = "elasticsearch";
18 | private static final int ES_HTTP_TEST_PORT = 9200;
19 | private static final String ES_REST_TEST_SCHEME = "http";
20 | private static final String INDEX_ALIAS = "urls";
21 | private static final String DOC_TYPE = "url";
22 |
23 |
24 | @Test
25 | public void testEsHttpSourceOperations000() throws IOException, InterruptedException {
26 | ElasticConnection connection = ElasticConnection.getConnection(ES_TEST_HOST, ES_HTTP_TEST_PORT, ES_REST_TEST_SCHEME);
27 | EsHttpUrlOperations esHttpUrlOperations = EsHttpUrlOperations.getInstance(connection, INDEX_ALIAS, DOC_TYPE);
28 |
29 | String url = "http://www.bbc.com/news/science-environment-43727547";
30 | String source = "www.bbc.com";
31 | esHttpUrlOperations.upsertUrlStatus(url, null, source, true, "a");
32 | Thread.sleep(6000);
33 | esHttpUrlOperations.upsertUrlStatus(url, null, source, false, "b");
34 | Thread.sleep(6000);
35 | List<HttpUrl> urls = esHttpUrlOperations.findUrlsByStatusAndSource("b", source, 10);
36 | assertTrue(urls.size() > 0);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/TestUtils.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.es;
2 |
3 | import java.io.IOException;
4 | import java.net.URISyntaxException;
5 | import java.nio.charset.StandardCharsets;
6 | import java.nio.file.Files;
7 | import java.nio.file.Paths;
8 |
9 | public class TestUtils {
10 |
11 | public static byte[] readResourceAsBytes(String filename) throws URISyntaxException, IOException {
12 | return Files.readAllBytes(Paths.get(TestUtils.class.getClassLoader().getResource(filename).toURI()));
13 | }
14 |
15 | public static String readResourceAsString(String filename) throws URISyntaxException, IOException {
16 | return new String(readResourceAsBytes(filename), StandardCharsets.UTF_8);
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, stdout
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Target=System.out
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{5}:%L - %m%n
7 |
--------------------------------------------------------------------------------
/elasticsearch/src/test/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | name=PropertiesConfig
2 | property.filename = logs
3 | appenders = console
4 | appender.console.type = Console
5 | appender.console.name = STDOUT
6 | appender.console.layout.type = PatternLayout
7 | appender.console.layout.pattern = [%-5level] %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %c{5} - %msg%n
8 |
9 | rootLogger.level = WARN
10 | rootLogger.appenderRefs = stdout
11 | rootLogger.appenderRef.stdout.ref = STDOUT
12 |
13 | logger.elasticsearch.name = org.elasticsearch
14 | logger.elasticsearch.level = debug
--------------------------------------------------------------------------------
/page-analyzer/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 | <parent>
6 | <artifactId>crawling-framework</artifactId>
7 | <groupId>lt.tokenmill.crawling</groupId>
8 | <version>0.3.4-SNAPSHOT</version>
9 | </parent>
10 | <modelVersion>4.0.0</modelVersion>
11 |
12 | <artifactId>page-analyzer</artifactId>
13 |
14 | <dependencies>
15 | <dependency>
16 | <groupId>lt.tokenmill.crawling</groupId>
17 | <artifactId>data-model</artifactId>
18 | <version>${project.version}</version>
19 | </dependency>
20 | <dependency>
21 | <groupId>org.jsoup</groupId>
22 | <artifactId>jsoup</artifactId>
23 | </dependency>
24 | <dependency>
25 | <groupId>com.google.guava</groupId>
26 | <artifactId>guava</artifactId>
27 | </dependency>
28 | <dependency>
29 | <groupId>com.mashape.unirest</groupId>
30 | <artifactId>unirest-java</artifactId>
31 | <version>1.4.9</version>
32 | </dependency>
33 | <dependency>
34 | <groupId>com.github.crawler-commons</groupId>
35 | <artifactId>crawler-commons</artifactId>
36 | <version>0.7</version>
37 | </dependency>
38 | <dependency>
39 | <groupId>org.slf4j</groupId>
40 | <artifactId>slf4j-log4j12</artifactId>
41 | <version>${slf4j.version}</version>
42 | <scope>provided</scope>
43 | </dependency>
44 | <dependency>
45 | <groupId>junit</groupId>
46 | <artifactId>junit</artifactId>
47 | <version>4.13.1</version>
48 | <scope>test</scope>
49 | </dependency>
50 | </dependencies>
51 |
52 | <profiles>
53 | <profile>
54 | <id>release</id>
55 | <build>
56 | <plugins>
57 | <plugin>
58 | <groupId>org.apache.maven.plugins</groupId>
59 | <artifactId>maven-source-plugin</artifactId>
60 | </plugin>
61 |
62 | <plugin>
63 | <groupId>org.apache.maven.plugins</groupId>
64 | <artifactId>maven-jar-plugin</artifactId>
65 | </plugin>
66 |
67 | <plugin>
68 | <groupId>org.apache.maven.plugins</groupId>
69 | <artifactId>maven-javadoc-plugin</artifactId>
70 | </plugin>
71 | </plugins>
72 | </build>
73 | </profile>
74 | </profiles>
75 |
76 | </project>
--------------------------------------------------------------------------------
/page-analyzer/src/main/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzer.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.pageanalyzer;
2 |
3 | import com.google.common.base.Joiner;
4 | import com.google.common.collect.Maps;
5 | import com.mashape.unirest.http.HttpResponse;
6 | import com.mashape.unirest.http.Unirest;
7 | import com.mashape.unirest.http.exceptions.UnirestException;
8 | import crawlercommons.robots.BaseRobotRules;
9 | import crawlercommons.robots.SimpleRobotRulesParser;
10 | import lt.tokenmill.crawling.data.HtmlAnalysisResult;
11 | import org.jsoup.Jsoup;
12 | import org.jsoup.nodes.Document;
13 | import org.jsoup.nodes.Element;
14 |
15 | import java.net.URL;
16 | import java.util.List;
17 | import java.util.Map;
18 | import java.util.stream.Collectors;
19 |
20 | public class PageAnalyzer {
21 |
22 | private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
23 |
24 | public static final String CONFIG_USER_AGENT = "UserAgent";
25 | public static final String CONFIG_ANALYZE_ROBOTS_TXT = "RobotsTxt";
26 |
27 | public static HtmlAnalysisResult analyze(Map<String, String> config, String url) {
28 | try {
29 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT);
30 | HttpResponse<String> response = Unirest.get(url)
31 | .header("User-Agent", userAgent)
32 | .asString();
33 | return analyze(config, url, response.getBody(), response.getStatus(), response.getHeaders());
34 | } catch (UnirestException e) {
35 | throw new RuntimeException(e);
36 | }
37 | }
38 |
39 | public static HtmlAnalysisResult analyze(Map<String, String> config, String url, String html) {
40 | return analyze(config, url, html, null, Maps.newHashMap());
41 | }
42 |
43 | public static HtmlAnalysisResult analyze(Map<String, String> config, String url, String html, Integer status, Map<String, List<String>> headers) {
44 | try {
45 | HtmlAnalysisResult result = new HtmlAnalysisResult();
46 | result.setUrl(url);
47 | result.setHttpStatus(status);
48 | result.setHeaders(headers.entrySet()
49 | .stream()
50 | .collect(Collectors.toMap(Map.Entry::getKey, e -> Joiner.on("\n").join(e.getValue()))));
51 |
52 | Document document = Jsoup.parse(html, url);
53 | result.setTitle(document.title());
54 |
55 | List<String> meta = document.select("meta").stream().map(Element::toString).collect(Collectors.toList());
56 | result.setMetaValues(meta);
57 |
58 | List<String> links = document.select("a").stream().map(e -> e.attr("abs:href")).collect(Collectors.toList());
59 | result.setLinks(links);
60 |
61 | if (Boolean.parseBoolean(config.get(CONFIG_ANALYZE_ROBOTS_TXT))) {
62 | String robotsUrl = robotsTxtUrl(url);
63 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT);
64 | HttpResponse<String> response = Unirest.get(robotsUrl)
65 | .header("User-Agent", userAgent)
66 | .asString();
67 | String robotsTxt = response.getBody();
68 | parseRobotsTxt(userAgent, robotsUrl, robotsTxt, result);
69 | }
70 | return result;
71 | } catch (Exception e) {
72 | throw new RuntimeException(e);
73 | }
74 | }
75 |
76 | public static void parseRobotsTxt(String userAgent, String robotsUrl, String robotsTxt, HtmlAnalysisResult result) {
77 | result.setRobotsTxt(robotsTxt);
78 | SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
79 | BaseRobotRules robotRules = robotsParser.parseContent(robotsUrl, robotsTxt.getBytes(), null, userAgent);
80 | result.setRobotsAllowedAll(robotRules.isAllowAll());
81 | result.setRobotsAllowedNone(robotRules.isAllowNone());
82 | result.setRobotsAllowedHome(robotRules.isAllowed("/"));
83 | result.setRobotsSitemaps(robotRules.getSitemaps());
84 | result.setRobotsCrawlDelay(robotRules.getCrawlDelay());
85 | }
86 |
87 | private static String robotsTxtUrl(String url) {
88 | try {
89 | URL urlObject = new URL(url);
90 | String portPart = urlObject.getPort() > 0 ? ":" + urlObject.getPort() : "";
91 | return String.format("%s://%s%s/robots.txt", urlObject.getProtocol(),
92 | urlObject.getHost(), portPart);
93 | } catch (Exception e) {
94 | throw new RuntimeException(e);
95 | }
96 | }
97 | }
98 |
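
A usage sketch mirroring the ignored fetchAndParse test further down (illustrative, not a repository file):

import com.google.common.collect.Maps;
import lt.tokenmill.crawling.data.HtmlAnalysisResult;

import java.util.Map;

public class PageAnalyzerExample {
    public static void main(String[] args) {
        Map<String, String> config = Maps.newHashMap();
        config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true"); // also fetch and evaluate robots.txt
        HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://www.tokenmill.lt/");
        System.out.println(result.getTitle() + ": " + result.getLinks().size() + " links");
    }
}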
--------------------------------------------------------------------------------
/page-analyzer/src/test/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzerTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.pageanalyzer;
2 |
3 | import com.google.common.base.Charsets;
4 | import com.google.common.collect.Lists;
5 | import com.google.common.collect.Maps;
6 | import com.google.common.io.Resources;
7 | import lt.tokenmill.crawling.data.HtmlAnalysisResult;
8 | import org.junit.Ignore;
9 | import org.junit.Test;
10 |
11 | import java.net.URL;
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.assertFalse;
18 | import static org.junit.Assert.assertTrue;
19 |
20 | public class PageAnalyzerTest {
21 |
22 | @Test
23 | public void headersAndStatus() {
24 | Map<String, List<String>> headers = Maps.newHashMap();
25 | headers.put("Etag", Lists.newArrayList("c1dc8d7be85325149", "ed5fc4d62b84752"));
26 | headers.put("Date", Lists.newArrayList("Wed, 11 Jan 2017 13:00:18 GMT"));
27 | HashMap<String, String> config = Maps.newHashMap();
28 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://example.org", "", 200, headers);
29 |
30 | assertEquals(new Integer(200), result.getHttpStatus());
31 | assertEquals(2, result.getHeaders().size());
32 | assertEquals("c1dc8d7be85325149\ned5fc4d62b84752", result.getHeaders().get("Etag"));
33 | assertEquals("Wed, 11 Jan 2017 13:00:18 GMT", result.getHeaders().get("Date"));
34 | }
35 |
36 |
37 | @Test
38 | public void htmlParsing() {
39 | HashMap<String, String> config = Maps.newHashMap();
40 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://bloomberg.com/", loadHtml("bloomberg.com"), 200, Maps.newHashMap());
41 | assertEquals("Bloomberg.com", result.getTitle());
42 | assertEquals(33, result.getMetaValues().size());
43 | assertTrue(result.getMetaValues().contains(""));
44 | assertEquals(361, result.getLinks().size());
45 | assertTrue(result.getLinks().contains("https://www.bloomberg.com/news/articles/2017-01-10/netanyahu-s-grip-on-power-under-threat-as-gift-scandal-escalates"));
46 | }
47 |
48 | @Test
49 | @Ignore
50 | public void fetchAndParse() {
51 | HashMap<String, String> config = Maps.newHashMap();
52 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true");
53 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://www.tokenmill.lt/");
54 | assertEquals("TokenMill - Natural Language Processing", result.getTitle());
55 | assertEquals(10, result.getMetaValues().size());
56 | assertEquals(42, result.getLinks().size());
57 | assertTrue(result.getLinks().contains("http://www.tokenmill.lt/#case-monitoring"));
58 | assertTrue(result.getRobotsAllowedAll());
59 | assertFalse(result.getRobotsAllowedNone());
60 | assertTrue(result.getRobotsAllowedHome());
61 | assertEquals(Lists.newArrayList(), result.getRobotsSitemaps());
62 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay());
63 |
64 | }
65 |
66 | @Test
67 | @Ignore
68 | public void fetchAndParseRobotsTxt() {
69 | HashMap<String, String> config = Maps.newHashMap();
70 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true");
71 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://www.google.com");
72 | assertFalse(result.getRobotsAllowedAll());
73 | assertFalse(result.getRobotsAllowedNone());
74 | assertTrue(result.getRobotsAllowedHome());
75 | assertTrue(result.getRobotsSitemaps().contains("http://www.gstatic.com/culturalinstitute/sitemaps/www_google_com_culturalinstitute/sitemap-index.xml"));
76 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay());
77 |
78 | }
79 |
80 | private static String loadHtml(String name) {
81 | try {
82 | URL htmlResource = Resources.getResource(name + ".html");
83 | return Resources.toString(htmlResource, Charsets.UTF_8);
84 | } catch (Exception e) {
85 | throw new RuntimeException(e);
86 | }
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/parser/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 | <parent>
6 | <artifactId>crawling-framework</artifactId>
7 | <groupId>lt.tokenmill.crawling</groupId>
8 | <version>0.3.4-SNAPSHOT</version>
9 | </parent>
10 | <modelVersion>4.0.0</modelVersion>
11 |
12 | <artifactId>parser</artifactId>
13 |
14 | <dependencies>
15 | <dependency>
16 | <groupId>lt.tokenmill.crawling</groupId>
17 | <artifactId>data-model</artifactId>
18 | <version>${project.version}</version>
19 | </dependency>
20 | <dependency>
21 | <groupId>org.jsoup</groupId>
22 | <artifactId>jsoup</artifactId>
23 | </dependency>
24 | <dependency>
25 | <groupId>com.github.jsonld-java</groupId>
26 | <artifactId>jsonld-java</artifactId>
27 | </dependency>
28 | <dependency>
29 | <groupId>com.google.guava</groupId>
30 | <artifactId>guava</artifactId>
31 | </dependency>
32 | <dependency>
33 | <groupId>org.apache.commons</groupId>
34 | <artifactId>commons-lang3</artifactId>
35 | <version>3.5</version>
36 | </dependency>
37 | <dependency>
38 | <groupId>org.clojure</groupId>
39 | <artifactId>clojure</artifactId>
40 | <version>1.7.0</version>
41 | </dependency>
42 | <dependency>
43 | <groupId>lt.tokenmill</groupId>
44 | <artifactId>timewords</artifactId>
45 | <version>${timewords.version}</version>
46 | </dependency>
47 | <dependency>
48 | <groupId>org.slf4j</groupId>
49 | <artifactId>slf4j-log4j12</artifactId>
50 | <version>${slf4j.version}</version>
51 | <scope>provided</scope>
52 | </dependency>
53 | <dependency>
54 | <groupId>junit</groupId>
55 | <artifactId>junit</artifactId>
56 | <version>4.13.1</version>
57 | <scope>test</scope>
58 | </dependency>
59 | </dependencies>
60 |
61 | <profiles>
62 | <profile>
63 | <id>release</id>
64 | <build>
65 | <plugins>
66 | <plugin>
67 | <groupId>org.apache.maven.plugins</groupId>
68 | <artifactId>maven-source-plugin</artifactId>
69 | </plugin>
70 |
71 | <plugin>
72 | <groupId>org.apache.maven.plugins</groupId>
73 | <artifactId>maven-jar-plugin</artifactId>
74 | </plugin>
75 |
76 | <plugin>
77 | <groupId>org.apache.maven.plugins</groupId>
78 | <artifactId>maven-javadoc-plugin</artifactId>
79 | </plugin>
80 | </plugins>
81 | </build>
82 | </profile>
83 | </profiles>
84 |
85 | </project>
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/PageAnalyzer.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | public class PageAnalyzer {
4 |
5 |
6 |
7 | }
8 |
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/TitleParser.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 |
4 | import com.google.common.base.Strings;
5 | import com.google.common.collect.Lists;
6 | import com.google.common.collect.Maps;
7 | import lt.tokenmill.crawling.parser.data.MatchedString;
8 | import org.jsoup.nodes.Document;
9 |
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.stream.Collectors;
13 |
14 | public class TitleParser {
15 |
16 | private static final List<String> TITLE_META_KEYS = Lists.newArrayList("og:title");
17 |
18 | public static List<MatchedString> extractFromMeta(Document document) {
19 | String itempropValue = document.select("[itemprop*=headline]").text();
20 | if (itempropValue != null && !itempropValue.trim().isEmpty()) {
21 | return Lists.newArrayList(new MatchedString(itempropValue, "[itemprop*=headline]"));
22 | }
23 | Map<String, String> metaValues = Maps.newHashMap();
24 | document.select("meta").forEach(m -> {
25 | String name = m.attr("name");
26 | String property = m.attr("property");
27 | String content = m.attr("content");
28 | if (!Strings.isNullOrEmpty(name)) {
29 | metaValues.put(name.toLowerCase(), content);
30 | } else if (!Strings.isNullOrEmpty(property)) {
31 | metaValues.put(property.toLowerCase(), content);
32 | }
33 | });
34 | return TITLE_META_KEYS.stream()
35 | .filter(k -> metaValues.get(k) != null)
36 | .map(k -> new MatchedString(metaValues.get(k), "META:" + k))
37 | .collect(Collectors.toList());
38 | }
39 | }
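A quick sketch of the fallback order implemented above: an itemprop headline wins outright; otherwise og:title meta content is collected (the HTML below is illustrative, not from the repository):

    Document doc = Jsoup.parse(
            "<html><head><meta property=\"og:title\" content=\"Hello\"></head></html>");
    List<MatchedString> titles = TitleParser.extractFromMeta(doc);
    // titles contains MatchedString("Hello", "META:og:title")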
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedDate.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.data;
2 |
3 | import org.joda.time.DateTime;
4 |
5 | import java.util.Objects;
6 |
7 | public class MatchedDate {
8 |
9 | private String value;
10 |
11 | private String match;
12 |
13 | private String pattern;
14 |
15 | private DateTime date;
16 |
17 | public MatchedDate(String value, String match) {
18 | this.value = value;
19 | this.match = match;
20 | }
21 |
22 | public String getValue() {
23 | return value;
24 | }
25 |
26 | public void setValue(String value) {
27 | this.value = value;
28 | }
29 |
30 | public String getMatch() {
31 | return match;
32 | }
33 |
34 | public void setMatch(String match) {
35 | this.match = match;
36 | }
37 |
38 | public DateTime getDate() {
39 | return date;
40 | }
41 |
42 | public void setDate(DateTime date) {
43 | this.date = date;
44 | }
45 |
46 | public String getPattern() {
47 | return pattern;
48 | }
49 |
50 | public void setPattern(String pattern) {
51 | this.pattern = pattern;
52 | }
53 |
54 | @Override
55 | public String toString() {
56 | return "MatchedDate{" +
57 | "value='" + value + '\'' +
58 | ", match='" + match + '\'' +
59 | ", pattern='" + pattern + '\'' +
60 | ", date=" + date +
61 | '}';
62 | }
63 |
64 | @Override
65 | public boolean equals(Object o) {
66 | if (this == o) return true;
67 | if (o == null || getClass() != o.getClass()) return false;
68 | MatchedDate that = (MatchedDate) o;
69 | return Objects.equals(value, that.value);
70 | }
71 |
72 | @Override
73 | public int hashCode() {
74 | return Objects.hash(value);
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedString.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.data;
2 |
3 | import java.util.Objects;
4 |
5 | public class MatchedString {
6 |
7 | private String value;
8 |
9 | private String match;
10 |
11 | public MatchedString(String value, String match) {
12 | this.value = value;
13 | this.match = match;
14 | }
15 |
16 | public String getValue() {
17 | return value;
18 | }
19 |
20 | public void setValue(String value) {
21 | this.value = value;
22 | }
23 |
24 | public String getMatch() {
25 | return match;
26 | }
27 |
28 | public void setMatch(String match) {
29 | this.match = match;
30 | }
31 |
32 | @Override
33 | public boolean equals(Object o) {
34 | if (this == o) return true;
35 | if (o == null || getClass() != o.getClass()) return false;
36 | MatchedString that = (MatchedString) o;
37 | return Objects.equals(value, that.value);
38 | }
39 |
40 | @Override
41 | public int hashCode() {
42 | return Objects.hash(value);
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/urls/UrlExtractor.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.urls;
2 |
3 | import org.jsoup.nodes.Document;
4 | import org.jsoup.select.Elements;
5 |
6 | import java.net.URI;
7 | import java.net.URISyntaxException;
8 | import java.util.HashSet;
9 | import java.util.Set;
10 | import java.util.stream.Collectors;
11 |
12 | public class UrlExtractor {
13 |
14 | private static boolean isAbsolute(String url) {
15 | try {
16 | URI uri = new URI(url);
17 | return uri.isAbsolute();
18 | } catch (URISyntaxException e) {
19 | e.printStackTrace();
20 | return false;
21 | }
22 | }
23 |
24 | private static Set<String> extract(Document document) {
25 | Set<String> canonicalUrls = new HashSet<>();
26 | if (document == null) {
27 | return canonicalUrls;
28 | }
29 |
30 | Elements elements = document.select("meta[property=og:url]");
31 | elements.forEach(element -> {
32 | String attr = element.attr("content");
33 | if (attr != null) {
34 | canonicalUrls.add(attr);
35 | }
36 | });
37 |
38 | elements = document.select("link[rel=canonical]");
39 | elements.forEach(element -> {
40 | String attr = element.attr("href");
41 | if (attr != null) {
42 | canonicalUrls.add(attr);
43 | }
44 | });
45 |
46 | return canonicalUrls.stream()
47 | .filter(UrlExtractor::isAbsolute)
48 | .collect(Collectors.toSet());
49 | }
50 |
51 | public static String extract(String url, Document document) {
52 | Set<String> canonicalUrls = extract(document);
53 | if (canonicalUrls == null) {
54 | return url;
55 | } else {
56 | return canonicalUrls.stream().findFirst().orElse(url);
57 | }
58 | }
59 | }
60 |
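Illustration of the precedence above: an absolute og:url or rel=canonical value replaces the fetch URL, while a missing or relative canonical falls back to it (HTML is illustrative):

    Document doc = Jsoup.parse(
            "<html><head><link rel=\"canonical\" href=\"https://example.com/a\"></head></html>");
    String canonical = UrlExtractor.extract("https://example.com/a?utm_source=feed", doc);
    // canonical == "https://example.com/a"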
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/utils/HttpSourceTester.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.utils;
2 |
3 | import com.google.common.collect.Maps;
4 | import lt.tokenmill.crawling.data.*;
5 | import lt.tokenmill.crawling.parser.ArticleExtractor;
6 | import lt.tokenmill.crawling.parser.urls.UrlFilters;
7 |
8 | import java.util.Map;
9 |
10 | import static com.google.common.base.Strings.nullToEmpty;
11 |
12 | public class HttpSourceTester {
13 |
14 | public static final String URL_ACCEPTED = "url_accepted";
15 | public static final String TITLE = "title";
16 | public static final String TEXT = "text";
17 | public static final String DATE = "date";
18 |
19 | public static Map<String, Difference> test(HttpSource source, HttpSourceTest data) {
20 | TestResult result = new TestResult();
21 |
22 | String url = data.getUrl();
23 | UrlFilters urlFilters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters());
24 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url);
25 | result.acceptedUrl(filteringResult.getAccepted(), data.getUrlAccepted());
26 |
27 | String html = nullToEmpty(data.getHtml()).trim();
28 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, source, null);
29 | HttpArticle article = parseResult.getArticle();
30 | result.title(nullToEmpty(article.getTitle()), nullToEmpty(data.getTitle()));
31 | result.text(nullToEmpty(article.getText()), nullToEmpty(data.getText()));
32 | result.date(article.getPublished() != null ? DataUtils.formatInUTC(article.getPublished()) : "", nullToEmpty(data.getDate()));
33 |
34 | return result.difference();
35 | }
36 |
37 | public static class Difference {
38 |
39 | private String actual;
40 |
41 | private String expected;
42 |
43 | public Difference(String actual, String expected) {
44 | this.actual = actual;
45 | this.expected = expected;
46 | }
47 |
48 | public String getActual() {
49 | return actual;
50 | }
51 |
52 | public String getExpected() {
53 | return expected;
54 | }
55 |
56 | @Override
57 | public String toString() {
58 | return "Difference{" +
59 | "actual='" + actual + '\'' +
60 | ", expected='" + expected + '\'' +
61 | '}';
62 | }
63 | }
64 |
65 | public static class TestResult {
66 |
67 | private boolean expectedUrlAccepted;
68 | private boolean actualUrlAccepted;
69 | private String expectedTitle;
70 | private String actualTitle;
71 | private String expectedText;
72 | private String actualText;
73 | private String expectedDate;
74 | private String actualDate;
75 |
76 | void acceptedUrl(boolean actual, boolean expected) {
77 | this.expectedUrlAccepted = expected;
78 | this.actualUrlAccepted = actual;
79 | }
80 |
81 | public void title(String actual, String expected) {
82 | this.expectedTitle = expected.trim();
83 | this.actualTitle = actual.trim();
84 | }
85 |
86 | public void text(String actual, String expected) {
87 | this.expectedText = expected.trim();
88 | this.actualText = actual.trim();
89 | }
90 |
91 | public void date(String actual, String expected) {
92 | this.expectedDate = expected.trim();
93 | this.actualDate = actual.trim();
94 | }
95 |
96 | public Map<String, Difference> difference() {
97 | Map<String, Difference> result = Maps.newLinkedHashMap();
98 | if (expectedUrlAccepted != actualUrlAccepted) {
99 | result.put(URL_ACCEPTED,
100 | new Difference(String.valueOf(actualUrlAccepted), String.valueOf(expectedUrlAccepted)));
101 | }
102 | if (!expectedTitle.equals(actualTitle)) {
103 | result.put(TITLE, new Difference(actualTitle, expectedTitle));
104 | }
105 | if (!expectedText.equals(actualText)) {
106 | result.put(TEXT, new Difference(actualText, expectedText));
107 | }
108 | if (!expectedDate.equals(actualDate)) {
109 | result.put(DATE, new Difference(actualDate, expectedDate));
110 | }
111 | return result;
112 | }
113 | }
114 |
115 | }
116 |
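Usage sketch (URL and expected values are illustrative; HttpSourceTesterTest below exercises the full contract): an empty difference map means the source configuration reproduces the expected extraction exactly.

    HttpSourceTest expected = new HttpSourceTest();
    expected.setUrl("http://www.example.com/article");
    expected.setHtml(html);                       // raw page HTML
    expected.setUrlAccepted(true);
    expected.setTitle("Expected title");
    expected.setText("Expected body text");
    expected.setDate("2016-08-23T22:41:57.000Z"); // compared against formatInUTC(published)
    Map<String, HttpSourceTester.Difference> diff = HttpSourceTester.test(source, expected);
    boolean passed = diff.isEmpty();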
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/utils/QueryParser.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.utils;
2 |
3 | import com.google.common.base.Strings;
4 | import com.google.common.collect.Lists;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 | import java.util.stream.Collectors;
9 |
10 | public class QueryParser {
11 |
12 | public static List<String> parseQuery(String query) {
13 | List<String> result = Lists.newArrayList();
14 | if (!Strings.isNullOrEmpty(query)) {
15 | query = query.replaceAll("(\\s*[+-]\\s*)", "#SPLIT#$1");
16 | return Arrays.stream(query.split("(#SPLIT#| )"))
17 | .map(String::trim)
18 | .filter(s -> !s.isEmpty())
19 | .collect(Collectors.toList());
20 | }
21 | return result;
22 | }
23 |
24 | }
25 |
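The #SPLIT# marker above is a sentinel: it is injected before each +/- operator so that a single split call can cut both on operators and on spaces. For example:

    // "+Turkey-Inflation" -> "#SPLIT#+Turkey#SPLIT#-Inflation" -> ["+Turkey", "-Inflation"]
    List<String> parts = QueryParser.parseQuery("+Turkey-Inflation");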
--------------------------------------------------------------------------------
/parser/src/main/java/lt/tokenmill/crawling/parser/utils/TextFilters.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.utils;
2 |
3 | import java.util.List;
4 | import java.util.Objects;
5 | import java.util.regex.Pattern;
6 |
7 | public class TextFilters {
8 |
9 | // A normalizer has the format "[match regexp]-->>[replacement string]".
10 | // Normalizers that do not contain the "-->>" separator are ignored,
11 | // as are normalizers whose [match regexp] does not compile.
12 | // A null t is treated as "".
13 | // If textNormalizers is null, the (null-safe) t is returned unchanged.
14 | public static String normalizeText(String t, List<String> textNormalizers) {
15 | t = Objects.toString(t, "");
16 | if (textNormalizers == null)
17 | return t;
18 | return textNormalizers.stream()
19 | .filter(tn -> tn.contains("-->>"))
20 | .reduce(t, (a, tn) -> {
21 | String[] parts = tn.split("-->>");
22 | String match = parts[0];
23 | try {
24 | Pattern.compile(match);
25 | } catch (Exception e) {
26 | return a;
27 | }
28 | String replacement = parts.length > 1 ? parts[1] : "";
29 | return a.replaceAll(match, replacement);
30 | }).trim();
31 | }
32 |
33 | }
34 |
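Illustration of the normalizer format (the patterns below are made up for the example):

    List<String> normalizers = Arrays.asList(
            "\\s+-->> ",          // collapse whitespace runs into a single space
            "Advertisement-->>",  // strip a boilerplate token
            "[(-->>x");           // ignored: "[(" does not compile as a regexp
    String clean = TextFilters.normalizeText("Advertisement  Hello   world", normalizers);
    // clean == "Hello world"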
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/AljazeeraExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import lt.tokenmill.crawling.data.HttpArticle;
4 | import lt.tokenmill.crawling.data.HttpSource;
5 | import org.junit.Test;
6 |
7 | import java.util.Arrays;
8 |
9 | import static junit.framework.TestCase.assertEquals;
10 |
11 | public class AljazeeraExtractorTest extends BaseArticleExtractorTest {
12 |
13 | @Test
14 | public void testFortune2() throws Exception {
15 | String html = loadArticle("aljazeera1");
16 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html";
17 | HttpArticle article = ArticleExtractor.extractArticle(html, url, getSourceConf(), null);
18 | assertEquals("2018-05-13T00:00:00.000Z", article.getPublished().toInstant().toString());
19 | }
20 |
21 | private HttpSource getSourceConf() {
22 | HttpSource source = new HttpSource();
23 | source.setDateSelectors(Arrays.asList(".article-duration"));
24 | return source;
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/BaseArticleExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import com.google.common.base.Charsets;
4 | import com.google.common.io.Resources;
5 |
6 | import java.net.URL;
7 |
8 | public abstract class BaseArticleExtractorTest {
9 |
10 | protected String loadArticle(String name) throws Exception {
11 | URL htmlResource = Resources.getResource("articles/" + name + ".html");
12 | return Resources.toString(htmlResource, Charsets.UTF_8);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/BloombergExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import lt.tokenmill.crawling.data.HttpArticle;
4 | import lt.tokenmill.crawling.data.HttpArticleParseResult;
5 | import lt.tokenmill.crawling.data.HttpSource;
6 | import org.joda.time.DateTime;
7 | import org.joda.time.DateTimeZone;
8 | import org.junit.Test;
9 |
10 | import static org.junit.Assert.assertEquals;
11 | import static org.junit.Assert.assertTrue;
12 |
13 | public class BloombergExtractorTest extends BaseArticleExtractorTest {
14 |
15 |
16 | @Test
17 | public void testBloomberg1() throws Exception {
18 | String html = loadArticle("bloomberg1");
19 | String url = "http://www.bloomberg.com/news/articles/2016-09-08/japan-index-futures-signal-bounce-as-ecb-outlook-weighs-on-bonds";
20 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, bloombergSource(), null);
21 | HttpArticle article = parseResult.getArticle();
22 | assertEquals("Stocks Sink With Bonds, Dollar Rallies as Complacency Broken", article.getTitle());
23 | assertTrue(article.getText().contains("erted declines of this size in stocks and bonds are rare though not "));
24 | assertTrue(article.getText().startsWith("Tranquility that has enveloped global"));
25 | assertEquals("META:parsely-pub-date", parseResult.getPublishedMatches().get(0));
26 | DateTime actualPublished = article.getPublished();
27 | DateTime expectedPublished = new DateTime(2016, 9, 8, 23, 14, 29, 36, DateTimeZone.UTC);
28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate()));
29 | }
30 |
31 | private HttpSource bloombergSource() {
32 | HttpSource source = new HttpSource();
33 | return source;
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/CyberscoopExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import lt.tokenmill.crawling.data.HttpArticleParseResult;
4 | import lt.tokenmill.crawling.data.HttpSource;
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.junit.Test;
8 |
9 | import java.util.Arrays;
10 |
11 | import static org.junit.Assert.assertEquals;
12 |
13 | public class CyberscoopExtractorTest extends BaseArticleExtractorTest {
14 |
15 | private static final String TITLE_SELECTOR = "h1.article__title";
16 |
17 | private HttpSource cyberscoopSourceWithoutTitleSelector() {
18 | HttpSource source = new HttpSource();
19 | return source;
20 | }
21 |
22 | private HttpSource cyberscoopSourceWithTitleSelector() {
23 | HttpSource source = new HttpSource();
24 | source.setTitleSelectors(Arrays.asList(TITLE_SELECTOR));
25 | return source;
26 | }
27 |
28 | @Test
29 | public void testTitleExtraction000() throws Exception {
30 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/";
31 | String html = loadArticle("cyberscoop1");
32 | Document document = Jsoup.parse(html, url);
33 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithoutTitleSelector(), null);
34 | assertEquals(1, article.getTitleMatches().size());
35 | assertEquals("META:og:title", article.getTitleMatches().get(0));
36 | }
37 |
38 | @Test
39 | public void testTitleExtraction001() throws Exception {
40 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/";
41 | String html = loadArticle("cyberscoop1");
42 | Document document = Jsoup.parse(html, url);
43 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithTitleSelector(), null);
44 | assertEquals(1, article.getTitleMatches().size());
45 | assertEquals(TITLE_SELECTOR, article.getTitleMatches().get(0));
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/FortuneExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import lt.tokenmill.crawling.data.HttpArticle;
4 | import lt.tokenmill.crawling.data.HttpSource;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.TestCase.assertEquals;
8 |
9 | public class FortuneExtractorTest extends BaseArticleExtractorTest {
10 |
11 | @Test
12 | public void testFortune1() throws Exception {
13 | String html = loadArticle("fortune1");
14 | String url = "http://fortune.com/2017/04/13/susan-fowler-uber-editor-stripe/";
15 | HttpArticle article = ArticleExtractor.extractArticle(html, url, fortuneSource(), "2017/04/13");
16 | assertEquals("2017-04-13T00:00:00.000Z", article.getPublished().toInstant().toString());
17 | }
18 |
19 | private HttpSource fortuneSource() {
20 | HttpSource source = new HttpSource();
21 | return source;
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/InvestingParserTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import com.google.common.collect.Lists;
4 | import lt.tokenmill.crawling.data.HttpArticle;
5 | import lt.tokenmill.crawling.data.HttpArticleParseResult;
6 | import lt.tokenmill.crawling.data.HttpSource;
7 | import org.joda.time.DateTime;
8 | import org.joda.time.DateTimeZone;
9 | import org.junit.Test;
10 |
11 | import static org.junit.Assert.assertEquals;
12 | import static org.junit.Assert.assertTrue;
13 |
14 | public class InvestingParserTest extends BaseArticleExtractorTest {
15 |
16 |
17 | @Test
18 | public void testInvesting1() throws Exception {
19 | String html = loadArticle("investing1");
20 | String url = "https://www.investing.com/analysis/opening-bell:-brexit,-davos-meetings-are-today%E2%80%99s-big-drivers-200172664";
21 | HttpArticleParseResult result = ArticleExtractor.extractArticleWithDetails(html, url, investingSource(), null);
22 | HttpArticle article = result.getArticle();
23 | assertEquals("Opening Bell: USD Drops, Pound Pops, Yen Soars", article.getTitle());
24 | assertTrue(article.getText().startsWith("by Eli Wright\nAs markets in the US return from the long holiday weekend"));
25 | assertTrue(article.getText().endsWith("ab Corporation (NYSE:SCHW) expects EPS of $0.36."));
26 | DateTime actualPublished = article.getPublished();
27 | DateTime expectedPublished = new DateTime(2017, 1, 17, 11, 8, 00, 00, DateTimeZone.UTC);
28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate()));
29 | }
30 |
31 | private HttpSource investingSource() {
32 | HttpSource source = new HttpSource();
33 | source.setTextSelectors(Lists.newArrayList("#contentSection p, #contentSection li"));
34 | source.setDateSelectors(Lists.newArrayList(".contentSectionDetails span"));
35 | source.setDateRegexps(Lists.newArrayList(".*\\((.+)\\).*"));
36 | return source;
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/KedainietisTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import lt.tokenmill.crawling.data.HttpArticle;
4 | import lt.tokenmill.crawling.data.HttpArticleParseResult;
5 | import lt.tokenmill.crawling.data.HttpSource;
6 | import org.junit.Test;
7 |
8 | import java.util.Arrays;
9 |
10 | import static org.junit.Assert.assertEquals;
11 | import static org.junit.Assert.assertNotNull;
12 | import static org.junit.Assert.assertTrue;
13 |
14 | public class KedainietisTest extends BaseArticleExtractorTest {
15 |
16 | private HttpSource kedainietisSource() {
17 | HttpSource source = new HttpSource();
18 | source.setLanguage("lt");
19 | source.setDateSelectors(Arrays.asList("span.dtreviewed"));
20 | return source;
21 | }
22 |
23 | @Test
24 | public void testKedainietis() throws Exception {
25 | String html = loadArticle("kedainietis");
26 | String url = "http://www.kedainietis.lt/naujienos/naujienos/nedeklaravus-gyvenamosios-vietos-nepasieks-ir-sodros-mokami-alimentai-17694/";
27 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, kedainietisSource(), null);
28 | HttpArticle article = parseResult.getArticle();
29 | assertEquals("Nedeklaravus gyvenamosios vietos, nepasieks ir „Sodros“ mokami alimentai".trim(), article.getTitle().trim());
30 | assertTrue(article.getText().contains("valstybės biudžeto Lietuvoje"));
31 | assertTrue(article.getText().startsWith("Iš valstybės"));
32 | assertEquals("span.dtreviewed", parseResult.getPublishedMatches().get(0));
33 | assertNotNull(article.getPublished());
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/ReutersExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser;
2 |
3 | import com.google.common.collect.Lists;
4 | import lt.tokenmill.crawling.data.HttpArticle;
5 | import lt.tokenmill.crawling.data.HttpArticleParseResult;
6 | import lt.tokenmill.crawling.data.HttpSource;
7 | import org.joda.time.DateTime;
8 | import org.joda.time.DateTimeZone;
9 | import org.junit.Test;
10 |
11 | import static org.junit.Assert.assertEquals;
12 | import static org.junit.Assert.assertNull;
13 | import static org.junit.Assert.assertTrue;
14 |
15 | public class ReutersExtractorTest extends BaseArticleExtractorTest {
16 |
17 |
18 | @Test
19 | public void testReuters1() throws Exception {
20 | String html = loadArticle("reuters1");
21 | String url = "http://www.reuters.com/finance/stocks/TEX/key-developments/article/3414284";
22 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null);
23 | assertEquals("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring- CNBC, citing source", article.getTitle());
24 | assertTrue(article.getText().contains("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring; Marcato supports Terex CEO - CNBC, citing source"));
25 | DateTime actualPublished = article.getPublished();
26 | DateTime expectedPublished = new DateTime(2016, 7, 28, 15, 35, DateTimeZone.UTC);
27 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate()));
28 | }
29 |
30 | @Test
31 | public void testReuters2() throws Exception {
32 | String html = loadArticle("reuters2");
33 | String url = "http://www.reuters.com/article/idUSFWN1B40B5";
34 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, reutersSource(), null);
35 | HttpArticle article = parseResult.getArticle();
36 | assertEquals("BRIEF-Canadian Solar unit Recurrent Energy reached commercial operation of 100 MWac/134 MWp", article.getTitle());
37 | assertTrue(article.getText().contains("Unit Recurrent Energy has reached commercial operation of 100 MWac/134 MWp Mustang solar power project"));
38 | assertEquals("LD+JSON", parseResult.getPublishedMatches().get(0));
39 | DateTime expectedPublished = new DateTime(2016, 8, 23, 12, 24, 3, DateTimeZone.UTC);
40 | DateTime actualPublished = article.getPublished();
41 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate()));
42 | }
43 |
44 | @Test
45 | public void testReuters3() throws Exception {
46 | String html = loadArticle("reuters3");
47 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2";
48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null);
49 | assertEquals("Tesla touts speed and driving range with new upgraded battery", article.getTitle());
50 | assertTrue(article.getText().contains(" models. But Musk said those were both millio"));
51 | DateTime expectedPublished = new DateTime(2016, 8, 23, 22, 41, 57, DateTimeZone.UTC);
52 | DateTime actualPublished = article.getPublished();
53 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate()));
54 | }
55 |
56 | @Test
57 | public void testReutersBlog1() throws Exception {
58 | String html = loadArticle("reuters-blogs1");
59 | String url = "http://blogs.reuters.com/breakingviews/2016/08/22/pfizer-bets-14-bln-it-knows-better-than-market/";
60 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersBlogsSource(), null);
61 | assertEquals("Pfizer bets $14 bln it knows better than market", article.getTitle());
62 | assertTrue(article.getText().contains("r may believe in a far more lucrative outcom"));
63 | DateTime actualPublished = article.getPublished();
64 | assertNull(actualPublished);
65 | }
66 |
67 |
68 | private HttpSource reutersSource() {
69 | HttpSource source = new HttpSource();
70 | source.setTitleSelectors(Lists.newArrayList("h1"));
71 | source.setDateSelectors(Lists.newArrayList("#sigDevArticleText .timestamp"));
72 | source.setTextSelectors(Lists.newArrayList("#article-text p"));
73 | return source;
74 | }
75 |
76 | private HttpSource reutersBlogsSource() {
77 | HttpSource source = new HttpSource();
78 | source.setTextSelectors(Lists.newArrayList("#postcontent p"));
79 | return source;
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlExtractorTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.urls;
2 |
3 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest;
4 | import org.jsoup.Jsoup;
5 | import org.jsoup.nodes.Document;
6 | import org.junit.Test;
7 |
8 | import static org.junit.Assert.assertEquals;
9 |
10 | public class UrlExtractorTest extends BaseArticleExtractorTest {
11 |
12 | @Test
13 | public void testExtraction00() throws Exception {
14 | String html = loadArticle("aljazeera1");
15 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html";
16 | Document document = Jsoup.parse(html);
17 | assertEquals(url, UrlExtractor.extract(url, document));
18 | assertEquals("https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html", UrlExtractor.extract("", document));
19 | }
20 |
21 | @Test
22 | public void testExtraction01() throws Exception {
23 | String html = loadArticle("kedainietis");
24 | String url = "url";
25 | Document document = Jsoup.parse(html);
26 | assertEquals(url, UrlExtractor.extract(url, document));
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlFiltersTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.urls;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.Arrays;
6 |
7 | import static org.junit.Assert.assertEquals;
8 |
9 | public class UrlFiltersTest {
10 |
11 | @Test
12 | public void testURLNormalizer000() {
13 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("a-->>b"), Arrays.asList());
14 | assertEquals("bbbb", urlFilters.filterWithDetails("aaaa").getNormalized());
15 | assertEquals("bbbb", urlFilters.filterWithDetails("abba").getNormalized());
16 |
17 | urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList());
18 | String url = "http://www.tokenmill.lt/#case-understand";
19 | assertEquals("http://www.tokenmill.lt/", urlFilters.filterWithDetails(url).getNormalized());
20 | }
21 |
22 | @Test
23 | public void testURLFilters000() {
24 | String url = "http://www.tokenmill.lt/#case-understand";
25 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList("+^http://www.tokenmill.lt/.*", "-.*apache.*"));
26 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url);
27 | assertEquals(true, filteringResult.getAccepted());
28 | assertEquals("+^http://www.tokenmill.lt/.*", filteringResult.getFilter());
29 | assertEquals(1, filteringResult.getNormalizers().size());
30 | assertEquals("http://www.tokenmill.lt/", filteringResult.getNormalized());
31 |
32 | assertEquals("http://www.tokenmill.lt/", urlFilters.filter(url));
33 | assertEquals(null, urlFilters.filter("http://nutch.apache.org/"));
34 |
35 | filteringResult = urlFilters.filterWithDetails("http://nutch.apache.org/");
36 | assertEquals(false, filteringResult.getAccepted());
37 | assertEquals("-.*apache.*", filteringResult.getFilter());
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/utils/HttpSourceTesterTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.utils;
2 |
3 | import com.google.common.collect.Lists;
4 | import lt.tokenmill.crawling.data.DataUtils;
5 | import lt.tokenmill.crawling.data.HttpArticle;
6 | import lt.tokenmill.crawling.data.HttpSource;
7 | import lt.tokenmill.crawling.data.HttpSourceTest;
8 | import lt.tokenmill.crawling.parser.ArticleExtractor;
9 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest;
10 | import org.junit.Test;
11 |
12 | import java.util.Map;
13 |
14 | import static org.junit.Assert.assertEquals;
15 |
16 | public class HttpSourceTesterTest extends BaseArticleExtractorTest {
17 |
18 | @Test
19 | public void exactMatch() throws Exception {
20 | String html = loadArticle("reuters3");
21 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2";
22 |
23 | HttpSource source = new HttpSource();
24 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$"));
25 |
26 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null);
27 |
28 | HttpSourceTest sourceTest = new HttpSourceTest();
29 | sourceTest.setHtml(html);
30 | sourceTest.setUrl(url);
31 | sourceTest.setUrlAccepted(true);
32 | sourceTest.setTitle(article.getTitle());
33 | sourceTest.setDate(DataUtils.formatInUTC(article.getPublished()));
34 | sourceTest.setText(article.getText());
35 |
36 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest);
37 | assertEquals(0, differences.size());
38 | }
39 |
40 | @Test
41 | public void allDifferent() throws Exception {
42 | String html = loadArticle("reuters3");
43 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2";
44 |
45 | HttpSource source = new HttpSource();
46 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$"));
47 |
48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null);
49 |
50 | HttpSourceTest sourceTest = new HttpSourceTest();
51 | sourceTest.setHtml(html);
52 | sourceTest.setUrl(url);
53 | sourceTest.setUrlAccepted(false);
54 | sourceTest.setTitle("Title");
55 | sourceTest.setDate("Published");
56 | sourceTest.setText("Text");
57 |
58 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest);
59 | assertEquals(4, differences.size());
60 | assertEquals("false", differences.get(HttpSourceTester.URL_ACCEPTED).getExpected());
61 | assertEquals("true", differences.get(HttpSourceTester.URL_ACCEPTED).getActual());
62 | assertEquals("Title", differences.get(HttpSourceTester.TITLE).getExpected());
63 | assertEquals(article.getTitle(), differences.get(HttpSourceTester.TITLE).getActual());
64 | assertEquals("Published", differences.get(HttpSourceTester.DATE).getExpected());
65 | assertEquals(DataUtils.formatInUTC(article.getPublished()), differences.get(HttpSourceTester.DATE).getActual());
66 | assertEquals("Text", differences.get(HttpSourceTester.TEXT).getExpected());
67 | assertEquals(article.getText(), differences.get(HttpSourceTester.TEXT).getActual());
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/parser/src/test/java/lt/tokenmill/crawling/parser/utils/QueryParserTest.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.parser.utils;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.junit.Test;
5 |
6 | import java.util.List;
7 |
8 | import static org.junit.Assert.assertEquals;
9 |
10 | public class QueryParserTest {
11 |
12 | @Test
13 | public void parseQuery() {
14 | List<String> parts = QueryParser.parseQuery("+Turkey-Inflation");
15 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts);
16 |
17 | parts = QueryParser.parseQuery("+Turkey -Inflation");
18 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts);
19 |
20 | parts = QueryParser.parseQuery("Turkey -Inflation");
21 | assertEquals(Lists.newArrayList("Turkey", "-Inflation"), parts);
22 |
23 | parts = QueryParser.parseQuery("+Turkey attack");
24 | assertEquals(Lists.newArrayList("+Turkey", "attack"), parts);
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/parser/src/test/resources/jsonld/bbc-1.json:
--------------------------------------------------------------------------------
1 | {
2 | "@context": "http:\/\/schema.org",
3 | "@type": "Article",
4 | "url": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745",
5 | "publisher": {
6 | "@type": "Organization",
7 | "name": "BBC News",
8 | "logo": {
9 | "@type": "ImageObject",
10 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1"
11 | }
12 | },
13 | "datePublished": "2017-08-30T10:32:11+01:00",
14 | "dateModified": "2017-08-30T10:32:11+01:00",
15 | "headline": "Venezuela: New assembly approves treason trials for opposition",
16 | "image": {
17 | "@type": "ImageObject",
18 | "width": 720,
19 | "height": 405,
20 | "url": "https:\/\/ichef-1.bbci.co.uk\/news\/720\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg"
21 | },
22 | "thumbnailUrl": "https:\/\/ichef.bbci.co.uk\/news\/208\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg",
23 | "author": {
24 | "@type": "Organization",
25 | "name": "BBC News",
26 | "logo": {
27 | "@type": "ImageObject",
28 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1"
29 | }
30 | },
31 | "mainEntityOfPage": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745"
32 | }
--------------------------------------------------------------------------------
/ui-commons/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <artifactId>crawling-framework</artifactId>
7 |         <groupId>lt.tokenmill.crawling</groupId>
8 |         <version>0.3.4-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>ui-commons</artifactId>
13 |
14 |     <dependencies>
15 |         <dependency>
16 |             <groupId>lt.tokenmill.crawling</groupId>
17 |             <artifactId>elasticsearch</artifactId>
18 |         </dependency>
19 |         <dependency>
20 |             <groupId>lt.tokenmill.crawling</groupId>
21 |             <artifactId>parser</artifactId>
22 |         </dependency>
23 |     </dependencies>
24 |
25 |     <profiles>
26 |         <profile>
27 |             <id>release</id>
28 |             <build>
29 |                 <plugins>
30 |                     <plugin>
31 |                         <groupId>org.apache.maven.plugins</groupId>
32 |                         <artifactId>maven-source-plugin</artifactId>
33 |                     </plugin>
34 |                     <plugin>
35 |                         <groupId>org.apache.maven.plugins</groupId>
36 |                         <artifactId>maven-jar-plugin</artifactId>
37 |                     </plugin>
38 |                     <plugin>
39 |                         <groupId>org.apache.maven.plugins</groupId>
40 |                         <artifactId>maven-javadoc-plugin</artifactId>
41 |                     </plugin>
42 |                 </plugins>
43 |             </build>
44 |         </profile>
45 |     </profiles>
46 | </project>
--------------------------------------------------------------------------------
/ui-commons/src/main/java/lt/tokenmill/crawling/commonui/Configuration.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.commonui;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.util.Properties;
7 |
8 | public class Configuration {
9 |
10 | public static final Configuration INSTANCE = new Configuration();
11 |
12 | private static final String DEFAULT_CONFIG_FILE_LOCATION = "conf/development.properties";
13 | private final Properties properties = new Properties();
14 |
15 | private Configuration() {
16 | try {
17 | properties.load(new FileInputStream(new File(System.getProperty("config", DEFAULT_CONFIG_FILE_LOCATION))));
18 | } catch (IOException e) {
19 | throw new RuntimeException(e);
20 | }
21 | }
22 |
23 | public String getString(String key, String defaultValue) {
24 | return properties.getProperty(key, defaultValue);
25 | }
26 |
27 | public int getInt(String key, int defaultValue) {
28 | return Integer.parseInt(properties.getProperty(key, Integer.toString(defaultValue)));
29 | }
30 |
31 | public String getString(String key) {
32 | return properties.getProperty(key);
33 | }
34 |
35 | public int getInt(String key) {
36 | return Integer.parseInt(properties.getProperty(key));
37 | }
38 |
39 | @Override
40 | public String toString() {
41 | return "Configuration{" +
42 | "properties='" + properties + "'" +
43 | "}";
44 | }
45 | }
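Usage sketch: the properties file defaults to conf/development.properties and can be pointed elsewhere with the "config" system property (the property keys below are illustrative, not confirmed names):

    // java -Dconfig=conf/docker-compose.properties ...
    String host = Configuration.INSTANCE.getString("es.hostname", "localhost");
    int port = Configuration.INSTANCE.getInt("es.rest.port", 9200);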
--------------------------------------------------------------------------------
/ui-commons/src/main/java/lt/tokenmill/crawling/commonui/ElasticSearch.java:
--------------------------------------------------------------------------------
1 | package lt.tokenmill.crawling.commonui;
2 |
3 | import lt.tokenmill.crawling.es.*;
4 |
5 | public class ElasticSearch {
6 |
7 | private static ElasticConnection CONNECTION;
8 | private static EsHttpSourceOperations HTTP_SOURCE_OPERATIONS;
9 | private static EsHttpSourceTestOperations HTTP_SOURCE_TEST_OPERATIONS;
10 | private static EsNamedQueryOperations NAMED_QUERY_OPERATIONS;
11 | private static EsDocumentOperations DOCUMENT_OPERATIONS;
12 | private static EsHttpUrlOperations URL_OPERATIONS;
13 |
14 | public static EsHttpSourceOperations getHttpSourceOperations() {
15 | if (HTTP_SOURCE_OPERATIONS == null) {
16 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM);
17 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM);
18 | HTTP_SOURCE_OPERATIONS = EsHttpSourceOperations.getInstance(getEsConnection(), index, type);
19 | }
20 | return HTTP_SOURCE_OPERATIONS;
21 | }
22 |
23 | public static EsHttpSourceTestOperations getHttpSourceTestOperations() {
24 | if (HTTP_SOURCE_TEST_OPERATIONS == null) {
25 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM);
26 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_TYPE_PARAM);
27 | HTTP_SOURCE_TEST_OPERATIONS = EsHttpSourceTestOperations.getInstance(getEsConnection(), index, type);
28 | }
29 | return HTTP_SOURCE_TEST_OPERATIONS;
30 | }
31 |
32 | public static EsNamedQueryOperations getNamedQueryOperations() {
33 | if (NAMED_QUERY_OPERATIONS == null) {
34 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_INDEX_PARAM);
35 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_TYPE_PARAM);
36 | NAMED_QUERY_OPERATIONS = EsNamedQueryOperations.getInstance(getEsConnection(), index, type);
37 | }
38 | return NAMED_QUERY_OPERATIONS;
39 | }
40 |
41 |
42 | public static EsDocumentOperations getDocumentOperations() {
43 | if (DOCUMENT_OPERATIONS == null) {
44 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_INDEX_NAME_PARAM);
45 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_DOC_TYPE_PARAM);
46 | DOCUMENT_OPERATIONS = EsDocumentOperations.getInstance(getEsConnection(), index, type);
47 | }
48 | return DOCUMENT_OPERATIONS;
49 | }
50 |
51 | public static EsHttpUrlOperations getUrlOperations() {
52 | if (URL_OPERATIONS == null) {
53 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_INDEX_NAME_PARAM);
54 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_DOC_TYPE_PARAM);
55 | URL_OPERATIONS = EsHttpUrlOperations.getInstance(getEsConnection(), index, type);
56 | }
57 | return URL_OPERATIONS;
58 | }
59 |
60 | private static ElasticConnection getEsConnection() {
61 | if (CONNECTION == null) {
62 | String hostname = Configuration.INSTANCE.getString(ElasticConstants.ES_HOSTNAME_PARAM, "localhost");
63 | int restPort = Configuration.INSTANCE.getInt(ElasticConstants.ES_REST_PORT, 9200);
64 | String restScheme = Configuration.INSTANCE.getString(ElasticConstants.ES_REST_SCHEME, "http");
65 | CONNECTION = ElasticConnection.getConnection(hostname, restPort, restScheme);
66 | }
67 | return CONNECTION;
68 | }
69 | }
70 |
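All operation objects above are created lazily and share one ElasticConnection. Note the getters are not synchronized, so concurrent first calls could in principle create duplicate instances; in practice they are first touched from a single UI startup path. A minimal usage sketch:

    // Both calls reuse the single REST connection built by getEsConnection().
    EsHttpSourceOperations sources = ElasticSearch.getHttpSourceOperations();
    EsNamedQueryOperations queries = ElasticSearch.getNamedQueryOperations();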
--------------------------------------------------------------------------------