├── .gitattributes
├── .github
│   └── tokenmill-logo.svg
├── .gitignore
├── .gitlab-ci.yml
├── Dockerfile.base
├── Dockerfile.crawler
├── Dockerfile.es
├── Dockerfile.ui
├── LICENSE
├── Makefile
├── README.md
├── administration-ui
│   ├── conf
│   │   ├── development.properties
│   │   └── docker-compose.properties
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── adminui
│       │   │                   ├── Application.java
│       │   │                   ├── CrawlerAdminUI.java
│       │   │                   ├── HttpSourceTestsCache.java
│       │   │                   ├── utils
│       │   │                   │   ├── CSVUtils.java
│       │   │                   │   ├── GridUtils.java
│       │   │                   │   ├── HttpSourceCSVUtils.java
│       │   │                   │   └── HttpSourceTestCSVUtils.java
│       │   │                   └── view
│       │   │                       ├── BaseView.java
│       │   │                       ├── HttpSourceForm.java
│       │   │                       ├── HttpSourceStatsWindow.java
│       │   │                       ├── HttpSourceTestWindow.java
│       │   │                       ├── HttpSourcesView.java
│       │   │                       ├── ImportExportView.java
│       │   │                       ├── imports
│       │   │                       │   ├── HttpSourceImportExport.java
│       │   │                       │   ├── HttpSourceTestImportExport.java
│       │   │                       │   └── NamedQueryImportExport.java
│       │   │                       ├── namedquery
│       │   │                       │   ├── NamedQueriesView.java
│       │   │                       │   ├── NamedQueryFormWindow.java
│       │   │                       │   └── NamedQueryResultsPanel.java
│       │   │                       ├── pageanalysis
│       │   │                       │   └── PageAnalysisView.java
│       │   │                       └── sourcetest
│       │   │                           ├── HttpSourceAllTestsWindow.java
│       │   │                           ├── HttpSourceTestFormWindow.java
│       │   │                           ├── HttpSourceTestsView.java
│       │   │                           └── TestResultsPanel.java
│       │   ├── resources
│       │   │   ├── log4j.properties
│       │   │   └── log4j2.properties
│       │   └── webapp
│       │       └── VAADIN
│       │           └── themes
│       │               └── crawleradmintheme
│       │                   ├── addons.scss
│       │                   ├── crawleradmintheme.scss
│       │                   ├── styles.css
│       │                   └── styles.scss
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── adminui
│           │                   └── utils
│           │                       ├── HttpSourceTestCSVUtilsTest.java
│           │                       └── HttpSourcesCSVUtilsTest.java
│           └── resources
│               └── www.tokenmill.lt.html
├── analysis-ui
│   ├── conf
│   │   └── development.properties
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── analysisui
│           │                   ├── AnalysisUI.java
│           │                   ├── Application.java
│           │                   ├── search
│           │                   │   └── ResultPanel.java
│           │                   └── view
│           │                       ├── BaseView.java
│           │                       ├── ContextCloudView.java
│           │                       └── SearchView.java
│           ├── resources
│           │   └── log4j.properties
│           └── webapp
│               └── VAADIN
│                   └── themes
│                       └── analysistheme
│                           ├── addons.scss
│                           ├── analysistheme.scss
│                           ├── styles.css
│                           └── styles.scss
├── bin
│   ├── create-es-index.sh
│   ├── create-es-indices.sh
│   ├── deploy-crawler.sh
│   ├── run-administration-ui.sh
│   ├── run-analysis-ui.sh
│   └── run-crawler.sh
├── crawler
│   ├── conf
│   │   ├── docker-compose.yaml
│   │   └── local.yaml
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── crawler
│       │   │                   ├── CrawlerConstants.java
│       │   │                   ├── CrawlerTopology.java
│       │   │                   ├── DefaultServiceProvider.java
│       │   │                   ├── ServiceProvider.java
│       │   │                   ├── bolt
│       │   │                   │   ├── ArticleIndexerBolt.java
│       │   │                   │   ├── LinkExtractorBolt.java
│       │   │                   │   └── StatusUpdaterBolt.java
│       │   │                   ├── spout
│       │   │                   │   ├── HttpSourceConfiguration.java
│       │   │                   │   └── UrlGeneratorSpout.java
│       │   │                   └── utils
│       │   │                       ├── PrioritizedSource.java
│       │   │                       ├── UrlFilterUtils.java
│       │   │                       └── UrlFiltersCache.java
│       │   └── resources
│       │       ├── urlfilters.json
│       │       └── urlfilters.txt
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── crawler
│                               └── spout
│                                   ├── UrlFilterUtilsTest.java
│                                   └── UrlGeneratorSpoutTest.java
├── data-model
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── data
│       │                       ├── DataUtils.java
│       │                       ├── HighlightedSearchResult.java
│       │                       ├── HtmlAnalysisResult.java
│       │                       ├── HttpArticle.java
│       │                       ├── HttpArticleParseResult.java
│       │                       ├── HttpSource.java
│       │                       ├── HttpSourceTest.java
│       │                       ├── HttpUrl.java
│       │                       ├── NamedQuery.java
│       │                       └── PageableList.java
│       └── test
│           └── java
│               └── lt
│                   └── tokenmill
│                       └── crawling
│                           └── data
│                               └── DataUtilsTest.java
├── docker-compose.dev.yml
├── docker-compose.run.yml
├── elasticsearch
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── lt
│       │   │       └── tokenmill
│       │   │           └── crawling
│       │   │               └── es
│       │   │                   ├── BaseElasticOps.java
│       │   │                   ├── ElasticConnection.java
│       │   │                   ├── ElasticConstants.java
│       │   │                   ├── EsDataParser.java
│       │   │                   ├── EsDocumentOperations.java
│       │   │                   ├── EsHttpSourceOperations.java
│       │   │                   ├── EsHttpSourceTestOperations.java
│       │   │                   ├── EsHttpSourcesCache.java
│       │   │                   ├── EsHttpUrlOperations.java
│       │   │                   ├── EsNamedQueryOperations.java
│       │   │                   ├── Utils.java
│       │   │                   └── model
│       │   │                       └── DateHistogramValue.java
│       │   └── resources
│       │       └── indices
│       │           ├── document.json
│       │           ├── http_source.json
│       │           ├── http_source_test.json
│       │           ├── query.json
│       │           └── url.json
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── es
│           │                   ├── ElasticConnectionTest.java
│           │                   ├── ElasticsearchTestServer.java
│           │                   ├── EsDocumentOperationsTest.java
│           │                   ├── EsHttpSourceOperationsTest.java
│           │                   ├── EsHttpSourceTestOperationsTest.java
│           │                   ├── EsHttpUrlOperationsTestInt.java
│           │                   ├── IndexManager.java
│           │                   └── TestUtils.java
│           └── resources
│               ├── log4j.properties
│               ├── log4j2.properties
│               └── www.tokenmill.lt.html
├── page-analyzer
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── pageanalyzer
│       │                       └── PageAnalyzer.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── pageanalyzer
│           │                   └── PageAnalyzerTest.java
│           └── resources
│               └── bloomberg.com.html
├── parser
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── lt
│       │           └── tokenmill
│       │               └── crawling
│       │                   └── parser
│       │                       ├── ArticleExtractor.java
│       │                       ├── DateParser.java
│       │                       ├── PageAnalyzer.java
│       │                       ├── TitleParser.java
│       │                       ├── data
│       │                       │   ├── MatchedDate.java
│       │                       │   └── MatchedString.java
│       │                       ├── urls
│       │                       │   ├── UrlExtractor.java
│       │                       │   └── UrlFilters.java
│       │                       └── utils
│       │                           ├── HttpSourceTester.java
│       │                           ├── JsonLdParser.java
│       │                           ├── QueryParser.java
│       │                           ├── TextFilters.java
│       │                           └── TextProfileSignature.java
│       └── test
│           ├── java
│           │   └── lt
│           │       └── tokenmill
│           │           └── crawling
│           │               └── parser
│           │                   ├── AljazeeraExtractorTest.java
│           │                   ├── BaseArticleExtractorTest.java
│           │                   ├── BloombergExtractorTest.java
│           │                   ├── CyberscoopExtractorTest.java
│           │                   ├── DateParserTest.java
│           │                   ├── FortuneExtractorTest.java
│           │                   ├── InvestingParserTest.java
│           │                   ├── JsonLdParserTest.java
│           │                   ├── KedainietisTest.java
│           │                   ├── ReutersExtractorTest.java
│           │                   ├── urls
│           │                   │   ├── UrlExtractorTest.java
│           │                   │   └── UrlFiltersTest.java
│           │                   └── utils
│           │                       ├── HttpSourceTesterTest.java
│           │                       ├── QueryParserTest.java
│           │                       ├── TextFilterTest.java
│           │                       └── TextProfileSignatureTest.java
│           └── resources
│               ├── articles
│               │   ├── aljazeera1.html
│               │   ├── bbc1.html
│               │   ├── bloomberg1.html
│               │   ├── cyberscoop1.html
│               │   ├── fortune1.html
│               │   ├── ft1.html
│               │   ├── investing1.html
│               │   ├── kedainietis.html
│               │   ├── nbcnews1.html
│               │   ├── reuters-blogs1.html
│               │   ├── reuters1.html
│               │   ├── reuters2.html
│               │   ├── reuters3.html
│               │   └── usanews1.html
│               └── jsonld
│                   └── bbc-1.json
├── pom.xml
└── ui-commons
    ├── pom.xml
    └── src
        └── main
            └── java
                └── lt
                    └── tokenmill
                        └── crawling
                            └── commonui
                                ├── Configuration.java
                                └── ElasticSearch.java

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.html linguist-vendored
*.css linguist-vendored
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
target/
*.iml
*.retry

**/*.gwt.xml
crawler/logs/
**/.classpath
**/.project
**/.settings

--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
stages:
  - base
  - test
  - build

prepare-base-docker:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/base:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/base:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/base:latest

prepare-base-elasticsearch:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.es -t registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest

prepare-administration-ui:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.ui -t registry.gitlab.com/tokenmill/crawling-framework/ui:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/ui:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/ui:latest

prepare-crawler:
  stage: base
  image: docker:stable
  when: manual
  services:
    - docker:dind
  before_script:
    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
  script:
    - docker build -f Dockerfile.crawler -t registry.gitlab.com/tokenmill/crawling-framework/crawler:latest .
    - docker push registry.gitlab.com/tokenmill/crawling-framework/crawler:latest
    - docker rmi registry.gitlab.com/tokenmill/crawling-framework/crawler:latest

unit-tests:
  stage: test
  image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
  when: always
  script:
    - mvn clean test

integration-tests:
  stage: test
  image: registry.gitlab.com/tokenmill/crawling-framework/base:latest
  services:
    - name: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest
      alias: elasticsearch
  when: always
  script:
    - mvn -Dtest=*TestInt -DfailIfNoTests=false clean test

--------------------------------------------------------------------------------
/Dockerfile.base:
--------------------------------------------------------------------------------
FROM maven:3.5.4-jdk-8-alpine as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN mvn clean install

# Keep only the populated local Maven repository so downstream images
# start with all project dependencies already resolved.
FROM maven:3.5.4-jdk-8-alpine
COPY --from=builder /root/.m2/ /root/.m2/

--------------------------------------------------------------------------------
/Dockerfile.crawler:
--------------------------------------------------------------------------------
FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN cd crawler && \
    mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests

FROM maven:3.5.4-jdk-8-alpine
RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY --from=builder /usr/src/cf/crawler/target/crawler-standalone.jar crawler-standalone.jar
COPY --from=builder /usr/src/cf/crawler/conf/docker-compose.yaml docker-compose.yaml

CMD ["java", "-cp", "crawler-standalone.jar", "lt.tokenmill.crawling.crawler.CrawlerTopology", "-local", "-conf", "docker-compose.yaml"]

--------------------------------------------------------------------------------
/Dockerfile.es:
--------------------------------------------------------------------------------
FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0 as builder

ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/e1f115e4ca285c3c24e847c4dd4be955e0ed51c2/wait-for-it.sh /utils/wait-for-it.sh

COPY bin/ bin/
COPY elasticsearch/ elasticsearch/

# Start Elasticsearch in the background, wait until it answers on :9200,
# create the framework's indices, then stop it so the initialized data
# directory can be baked into the final image below.
RUN /usr/local/bin/docker-entrypoint.sh elasticsearch -p /tmp/epid & /bin/bash /utils/wait-for-it.sh -t 0 localhost:9200 -- \
    ./bin/create-es-indices.sh ; \
    kill $(cat /tmp/epid) && wait $(cat /tmp/epid); exit 0;

FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0

COPY --from=builder /usr/share/elasticsearch/data /usr/share/elasticsearch/data

--------------------------------------------------------------------------------
/Dockerfile.ui:
--------------------------------------------------------------------------------
FROM registry.gitlab.com/tokenmill/crawling-framework/base:latest as builder

RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY . .

RUN cd administration-ui && mvn clean package -Pbigjar

FROM maven:3.5.4-jdk-8-alpine
RUN mkdir -p /usr/src/cf
WORKDIR /usr/src/cf

COPY --from=builder /usr/src/cf/administration-ui/target/administration-ui-standalone.jar administration-ui-standalone.jar
COPY --from=builder /usr/src/cf/administration-ui/conf/docker-compose.properties docker-compose.properties

CMD ["java", "-Dconfig=docker-compose.properties", "-jar", "administration-ui-standalone.jar"]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2017-2019 Tokenmill, UAB

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
unit-test:
	mvn clean test

run-dev-env:
	docker-compose -f docker-compose.dev.yml pull && \
	docker-compose -f docker-compose.dev.yml down && \
	docker-compose -f docker-compose.dev.yml build && \
	docker-compose -f docker-compose.dev.yml up --remove-orphans

build-base-docker:
	docker build -f Dockerfile.base -t registry.gitlab.com/tokenmill/crawling-framework/deps:latest .

publish-base-docker: build-base-docker
	docker push registry.gitlab.com/tokenmill/crawling-framework/deps:latest

run-framework:
	docker-compose -f docker-compose.run.yml pull && \
	docker-compose -f docker-compose.run.yml down && \
	docker-compose -f docker-compose.run.yml build && \
	docker-compose -f docker-compose.run.yml up --remove-orphans

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Crawling Framework

[![Maven Central](https://img.shields.io/maven-central/v/lt.tokenmill.crawling/crawling-framework.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22lt.tokenmill.crawling%22%20AND%20a:%22crawling-framework%22)
[![pipeline status](https://gitlab.com/tokenmill/crawling-framework/badges/master/pipeline.svg)](https://gitlab.com/tokenmill/crawling-framework/commits/master)

Crawling Framework provides instruments to configure and run a [Storm Crawler](http://stormcrawler.net/) based crawler. It mainly aims at easing the crawling of sites which publish article content, such as news portals and blogs. With the GUI tool the Crawling Framework provides, you can:

1. Specify which sites to crawl.
1. Configure URL inclusion and exclusion filters, thus controlling which sections of a site will be fetched.
1. Specify which elements of a page carry the article's publication name, title, and main body.
1. Define tests which validate that the extraction rules are working.

Once configuration is done, the Crawling Framework runs a [Storm Crawler](http://stormcrawler.net/) based crawl following the rules specified in the configuration.

## Introduction

We have recorded a video on how to set up and use the Crawling Framework. Click on the image below to watch it on YouTube.

[![Crawling Framework Intro](https://img.youtube.com/vi/AvO4lmmIuis/0.jpg)](https://www.youtube.com/watch?v=AvO4lmmIuis)

## Requirements

The framework writes its configuration to and stores crawled data in ElasticSearch. Before starting a crawl project, [install ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html) (the Crawling Framework is tested to work with Elastic v7.x).

Crawling Framework is a Java library which has to be extended to run a Storm Crawler topology, so a Java (JDK8, Maven) infrastructure is needed.

### Using password protected ElasticSearch

Some providers put ElasticSearch behind an authentication step (which makes sense). Just set the environment variables `ES_USERNAME` and `ES_PASSWORD` accordingly; everything else can remain the same. Authentication will be performed implicitly when proper credentials are present.
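For illustration only, here is a minimal sketch of the Basic-auth header such credentials produce. The variable names come from this README; the class and the printed output are made-up examples, not the framework's actual client wiring:

```java
import java.util.Base64;

public class EsBasicAuthSketch {

    public static void main(String[] args) {
        // ES_USERNAME / ES_PASSWORD are the variables this README describes.
        String username = System.getenv("ES_USERNAME");
        String password = System.getenv("ES_PASSWORD");
        if (username != null && password != null) {
            // Standard HTTP Basic scheme: base64("user:password").
            String token = Base64.getEncoder()
                    .encodeToString((username + ":" + password).getBytes());
            System.out.println("Authorization: Basic " + token);
        }
    }
}
```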
## Configuring and Running a crawl

See the [Crawling Framework Example](https://github.com/tokenmill/crawling-framework-example) project's documentation.

## License

Copyright © 2017-2019 [TokenMill UAB](http://www.tokenmill.ai).

Distributed under the Apache License, Version 2.0.

--------------------------------------------------------------------------------
/administration-ui/conf/development.properties:
--------------------------------------------------------------------------------
port=8081
es.hostname=localhost
es.transport.port=9300
es.httpsource.index.name=http_sources
es.httpsource.doc.type=http_source
es.httpsourcetest.index.name=http_source_tests
es.httpsourcetest.doc.type=http_source_test
es.namedqueries.index.name=named_queries
es.namedqueries.doc.type=named_query
es.docs.index.name=docs
es.docs.doc.type=doc
es.urls.index.name=urls
es.urls.doc.type=url

--------------------------------------------------------------------------------
/administration-ui/conf/docker-compose.properties:
--------------------------------------------------------------------------------
port=8081
es.hostname=elasticsearch
es.transport.port=9300
es.httpsource.index.name=http_sources
es.httpsource.doc.type=http_source
es.httpsourcetest.index.name=http_source_tests
es.httpsourcetest.doc.type=http_source_test
es.namedqueries.index.name=named_queries
es.namedqueries.doc.type=named_query
es.docs.index.name=docs
es.docs.doc.type=doc
es.urls.index.name=urls
es.urls.doc.type=url
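The two properties files above differ only in `es.hostname` (localhost for development, the `elasticsearch` service name under docker-compose; Dockerfile.ui selects the latter via `-Dconfig=docker-compose.properties`). A minimal sketch of reading such values through the `Configuration` helper, whose `getInt`/`getString` accessors appear in `Application.java` below — the key names are from the files above, the wrapper class itself is just an example:

```java
import lt.tokenmill.crawling.commonui.Configuration;

public class ConfigurationSketch {

    public static void main(String[] args) {
        // Same accessors Application.java uses: a key plus a default value.
        int port = Configuration.INSTANCE.getInt("port", 8080);
        String esHost = Configuration.INSTANCE.getString("es.hostname", "localhost");
        System.out.println("UI on port " + port + ", Elasticsearch at " + esHost);
    }
}
```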
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/Application.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.vaadin.server.VaadinServlet;
import lt.tokenmill.crawling.commonui.Configuration;
import org.eclipse.jetty.security.*;
import org.eclipse.jetty.security.authentication.BasicAuthenticator;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.eclipse.jetty.util.security.Constraint;
import org.eclipse.jetty.util.security.Credential;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class Application {

    private static final Logger LOG = LoggerFactory.getLogger(Application.class);
    private static final Boolean PRODUCTION_MODE = true;

    // Builds a Jetty security handler that protects every path with
    // HTTP Basic auth for a single "editor" user.
    private static SecurityHandler basicAuth(String username, String password, String realm) {
        HashLoginService l = new HashLoginService();
        l.putUser(username, Credential.getCredential(password), new String[]{"editor"});
        l.setName(realm);

        Constraint constraint = new Constraint();
        constraint.setName(Constraint.__BASIC_AUTH);
        constraint.setRoles(new String[]{"editor"});
        constraint.setAuthenticate(true);

        ConstraintMapping cm = new ConstraintMapping();
        cm.setConstraint(constraint);
        cm.setPathSpec("/*");

        ConstraintSecurityHandler csh = new ConstraintSecurityHandler();
        csh.setAuthenticator(new BasicAuthenticator());
        csh.setRealmName("cf");
        csh.addConstraintMapping(cm);
        csh.setLoginService(l);

        return csh;
    }

    public static void main(String[] args) {
        int port = Configuration.INSTANCE.getInt("port", 8080);
        Server server = new Server(port);
        ServletContextHandler contextHandler
                = new ServletContextHandler(ServletContextHandler.SESSIONS);

        boolean authEnabled = Boolean.parseBoolean(Configuration.INSTANCE.getString("basicAuth", "false"));

        if (authEnabled) {
            contextHandler.setSecurityHandler(basicAuth(System.getenv("UI_USER"), System.getenv("UI_PASSWORD"), "editor"));
        }
        contextHandler.setContextPath("/");
        ServletHolder sh = new ServletHolder(new VaadinServlet());
        contextHandler.addServlet(sh, "/*");
        contextHandler.setInitParameter("ui", CrawlerAdminUI.class.getCanonicalName());
        contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE));
        server.setHandler(contextHandler);

        try {
            server.start();
            server.join();
        } catch (Exception e) {
            LOG.error("Failed to start application", e);
        }
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/CrawlerAdminUI.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.vaadin.annotations.Theme;
import com.vaadin.annotations.VaadinServletConfiguration;
import com.vaadin.server.VaadinRequest;
import com.vaadin.server.VaadinServlet;
import com.vaadin.ui.UI;
import lt.tokenmill.crawling.adminui.view.HttpSourcesView;

import javax.servlet.annotation.WebServlet;

@Theme("crawleradmintheme")
public class CrawlerAdminUI extends UI {

    @Override
    protected void init(VaadinRequest vaadinRequest) {
        setContent(new HttpSourcesView());
    }

    @WebServlet(urlPatterns = "/*", name = "CrawlerAdminUIServlet", asyncSupported = true)
    @VaadinServletConfiguration(ui = CrawlerAdminUI.class, productionMode = false)
    public static class CrawlerAdminUIServlet extends VaadinServlet {
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/HttpSourceTestsCache.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui;

import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.TimeUnit;

public class HttpSourceTestsCache {

    private static final Cache<String, HttpSourceTest> CACHE = CacheBuilder
            .newBuilder()
            .maximumSize(1000)
            .expireAfterWrite(5, TimeUnit.DAYS)
            .build();

    public static HttpSourceTest get(String sourceUrl) {
        HttpSourceTest test = CACHE.getIfPresent(sourceUrl.toLowerCase());
        return test != null ? test : new HttpSourceTest("", "");
    }

    public static void put(String sourceUrl, String url, String html) {
        CACHE.put(sourceUrl.toLowerCase(),
                new HttpSourceTest(Strings.nullToEmpty(url), Strings.nullToEmpty(html)));
    }

    public static class HttpSourceTest {

        private String url;
        private String html;

        public HttpSourceTest(String url, String html) {
            this.url = url;
            this.html = html;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getHtml() {
            return html;
        }

        public void setHtml(String html) {
            this.html = html;
        }
    }
}
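A short usage sketch of the cache above (the example URLs are made up):

```java
import lt.tokenmill.crawling.adminui.HttpSourceTestsCache;

public class HttpSourceTestsCacheSketch {

    public static void main(String[] args) {
        // Keys are lower-cased source URLs; entries expire five days after write.
        HttpSourceTestsCache.put("http://example.com/", "http://example.com/article", "<html>...</html>");

        // Lookups lower-case the key as well, so casing does not matter.
        HttpSourceTestsCache.HttpSourceTest cached = HttpSourceTestsCache.get("HTTP://EXAMPLE.COM/");
        System.out.println(cached.getUrl()); // http://example.com/article

        // Misses return an empty placeholder instead of null.
        HttpSourceTestsCache.HttpSourceTest missing = HttpSourceTestsCache.get("http://unknown.example/");
        System.out.println(missing.getUrl().isEmpty()); // true
    }
}
```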
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/CSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.collect.Maps;
import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.Map;

public class CSVUtils {

    private static final char DEFAULT_SEPARATOR = ',';
    private static final char DEFAULT_QUOTE = '\"';
    private static final char DEFAULT_ESCAPE = '\\';

    public static CSVWriter createDefaultWriter(Writer writer) {
        return new CSVWriter(writer, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
    }

    public static CSVReader createDefaultReader(Reader reader) {
        return new CSVReader(reader, DEFAULT_SEPARATOR, DEFAULT_QUOTE, DEFAULT_ESCAPE);
    }

    public static CSVReader createDefaultReader(String csv) {
        return createDefaultReader(new StringReader(csv));
    }

    // Maps each wanted column name to its position in the header row,
    // so importers can tolerate files with reordered columns.
    public static Map<String, Integer> resolveColumnIndexes(String[] columns, String[] headers) {
        Map<String, Integer> result = Maps.newHashMap();
        for (String column : columns) {
            for (int i = 0; i < headers.length; i++) {
                if (headers[i].equalsIgnoreCase(column)) {
                    result.put(column, i);
                }
            }
        }
        return result;
    }
}
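`resolveColumnIndexes` is what lets the CSV importers below tolerate files with reordered columns; a small illustration with made-up headers:

```java
import lt.tokenmill.crawling.adminui.utils.CSVUtils;

import java.util.Map;

public class CsvColumnSketch {

    public static void main(String[] args) {
        // Header row as it appeared in an uploaded file, in arbitrary order.
        String[] headers = {"name", "url", "language"};
        // Columns the importer actually needs.
        String[] columns = {"url", "name"};

        Map<String, Integer> indexes = CSVUtils.resolveColumnIndexes(columns, headers);
        System.out.println(indexes.get("url"));  // 1
        System.out.println(indexes.get("name")); // 0
    }
}
```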
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/GridUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Joiner;
import com.vaadin.data.Item;
import com.vaadin.data.util.PropertyValueGenerator;
import com.vaadin.data.util.converter.Converter;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

public class GridUtils {

    public static class StringListConverter implements Converter<String, List> {

        @Override
        public List convertToModel(String s, Class<? extends List> aClass, Locale locale) throws ConversionException {
            return new ArrayList();
        }

        @Override
        public String convertToPresentation(List list, Class<? extends String> aClass, Locale locale) throws ConversionException {
            return Joiner.on(", ").join(list);
        }

        @Override
        public Class<List> getModelType() {
            return List.class;
        }

        @Override
        public Class<String> getPresentationType() {
            return String.class;
        }
    }

    public static class UrlToLinkConverter implements Converter<String, String> {

        @Override
        public String convertToModel(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
            return string;
        }

        @Override
        public String convertToPresentation(String string, Class<? extends String> aClass, Locale locale) throws ConversionException {
            // Wrap the raw URL in an anchor tag so the grid renders it as a link.
            return String.format("<a href=\"%s\">%s</a>", string, string);
        }

        @Override
        public Class<String> getModelType() {
            return String.class;
        }

        @Override
        public Class<String> getPresentationType() {
            return String.class;
        }
    }

    public static class ButtonPropertyGenerator extends PropertyValueGenerator<String> {

        private String name;

        public ButtonPropertyGenerator(String name) {
            this.name = name;
        }

        @Override
        public String getValue(Item item, Object itemId, Object propertyId) {
            return name;
        }

        @Override
        public Class<String> getType() {
            return String.class;
        }
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceCSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Strings;
import lt.tokenmill.crawling.data.DataUtils;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.es.Utils;

import java.util.Map;
import java.util.Objects;

public class HttpSourceCSVUtils {

    public static final String[] CSV_COLUMNS = new String[]{
            "url", "name", "language", "timezone", "enabled",
            "discovery_enabled", "url_crawl_delay_secs", "feed_crawl_delay_secs",
            "sitemap_crawl_delay_secs", "urls", "feeds", "sitemaps",
            "categories", "app_ids",
            "url_filters", "url_normalizers", "title_selectors",
            "text_selectors", "text_normalizers",
            "date_selectors", "date_regexps", "date_formats"};

    public static String[] mapHttpSourceToCsvRow(HttpSource ld) {
        return new String[]{
                ld.getUrl(), ld.getName(), ld.getLanguage(), ld.getTimezone(),
                String.valueOf(ld.isEnabled()), String.valueOf(ld.isDiscoveryEnabled()),
                Objects.toString(ld.getUrlRecrawlDelayInSecs(), ""),
                Objects.toString(ld.getFeedRecrawlDelayInSecs(), ""),
                Objects.toString(ld.getSitemapRecrawlDelayInSecs(), ""),
                Utils.listToText(ld.getUrls()), Utils.listToText(ld.getFeeds()), Utils.listToText(ld.getSitemaps()),
                Utils.listToText(ld.getCategories()), Utils.listToText(ld.getAppIds()),
                Utils.listToText(ld.getUrlFilters()), Utils.listToText(ld.getUrlNormalizers()),
                Utils.listToText(ld.getTitleSelectors()),
                Utils.listToText(ld.getTextSelectors()), Utils.listToText(ld.getTextNormalizers()),
                Utils.listToText(ld.getDateSelectors()), Utils.listToText(ld.getDateRegexps()),
                Utils.listToText(ld.getDateFormats())
        };
    }

    public static HttpSource mapCsvRowToHttpSource(String[] row, Map<String, Integer> columnIndexes) {
        HttpSource hs = new HttpSource();
        hs.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
        hs.setName(Strings.emptyToNull(row[columnIndexes.get("name")]));
        hs.setLanguage(Strings.emptyToNull(row[columnIndexes.get("language")]));
        hs.setTimezone(Strings.emptyToNull(row[columnIndexes.get("timezone")]));
        hs.setEnabled(Boolean.parseBoolean(row[columnIndexes.get("enabled")]));
        hs.setDiscoveryEnabled(Boolean.parseBoolean(row[columnIndexes.get("discovery_enabled")]));
        hs.setUrlRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("url_crawl_delay_secs")]));
        hs.setFeedRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("feed_crawl_delay_secs")]));
        hs.setSitemapRecrawlDelayInSecs(DataUtils.tryParseInteger(row[columnIndexes.get("sitemap_crawl_delay_secs")]));
        hs.setUrls(DataUtils.parseStringList(row[columnIndexes.get("urls")]));
        hs.setFeeds(DataUtils.parseStringList(row[columnIndexes.get("feeds")]));
        hs.setSitemaps(DataUtils.parseStringList(row[columnIndexes.get("sitemaps")]));
        hs.setCategories(DataUtils.parseStringList(row[columnIndexes.get("categories")]));
        hs.setAppIds(DataUtils.parseStringList(row[columnIndexes.get("app_ids")]));
        hs.setUrlFilters(DataUtils.parseStringList(row[columnIndexes.get("url_filters")]));
        hs.setUrlNormalizers(DataUtils.parseStringList(row[columnIndexes.get("url_normalizers")]));
        hs.setTitleSelectors(DataUtils.parseStringList(row[columnIndexes.get("title_selectors")]));
        hs.setTextSelectors(DataUtils.parseStringList(row[columnIndexes.get("text_selectors")]));
        hs.setTextNormalizers(DataUtils.parseStringList(row[columnIndexes.get("text_normalizers")]));
        hs.setDateSelectors(DataUtils.parseStringList(row[columnIndexes.get("date_selectors")]));
        hs.setDateRegexps(DataUtils.parseStringList(row[columnIndexes.get("date_regexps")]));
        hs.setDateFormats(DataUtils.parseStringList(row[columnIndexes.get("date_formats")]));
        return hs;
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtils.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.io.BaseEncoding;
import lt.tokenmill.crawling.data.HttpSourceTest;

import java.util.Map;
import java.util.Objects;

public class HttpSourceTestCSVUtils {

    public static final String[] CSV_COLUMNS = new String[]{
            "url", "source", "html", "url_accepted", "title", "text", "date"};

    public static String[] mapHttpSourceTestToCsvRow(HttpSourceTest httpSourceTest) {
        return new String[]{
                httpSourceTest.getUrl(), httpSourceTest.getSource(),
                // Base64-wrap the HTML so multi-line markup survives the CSV round trip.
                BaseEncoding.base64().encode(httpSourceTest.getHtml().getBytes(Charsets.UTF_8)),
                Objects.toString(httpSourceTest.getUrlAccepted(), "false"),
                Strings.nullToEmpty(httpSourceTest.getTitle()),
                Strings.nullToEmpty(httpSourceTest.getText()),
                Strings.nullToEmpty(httpSourceTest.getDate())
        };
    }

    public static HttpSourceTest mapCsvRowToHttpSourceTest(String[] row, Map<String, Integer> columnIndexes) {
        HttpSourceTest hst = new HttpSourceTest();
        hst.setUrl(Strings.emptyToNull(row[columnIndexes.get("url")]));
        hst.setSource(Strings.emptyToNull(row[columnIndexes.get("source")]));
        hst.setHtml(new String(BaseEncoding.base64().decode(row[columnIndexes.get("html")]), Charsets.UTF_8));
        hst.setUrlAccepted(Boolean.parseBoolean(row[columnIndexes.get("url_accepted")]));
        hst.setTitle(Strings.emptyToNull(row[columnIndexes.get("title")]));
        hst.setText(Strings.emptyToNull(row[columnIndexes.get("text")]));
        hst.setDate(Strings.emptyToNull(row[columnIndexes.get("date")]));
        return hst;
    }
}
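The HTML column above is base64-wrapped so that multi-line, quote-heavy markup survives the CSV round trip; a tiny demonstration of the same Guava calls:

```java
import com.google.common.base.Charsets;
import com.google.common.io.BaseEncoding;

public class HtmlRoundTripSketch {

    public static void main(String[] args) {
        String html = "<html>\n  \"quoted\" content\n</html>";
        // Same encode/decode pair as mapHttpSourceTestToCsvRow / mapCsvRowToHttpSourceTest.
        String encoded = BaseEncoding.base64().encode(html.getBytes(Charsets.UTF_8));
        String decoded = new String(BaseEncoding.base64().decode(encoded), Charsets.UTF_8);
        System.out.println(html.equals(decoded)); // true
    }
}
```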
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/BaseView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;


import com.vaadin.ui.HorizontalLayout;
import com.vaadin.ui.MenuBar;
import com.vaadin.ui.UI;
import com.vaadin.ui.VerticalLayout;
import lt.tokenmill.crawling.adminui.view.namedquery.NamedQueriesView;
import lt.tokenmill.crawling.adminui.view.pageanalysis.PageAnalysisView;
import lt.tokenmill.crawling.adminui.view.sourcetest.HttpSourceTestsView;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;

public class BaseView extends VerticalLayout {

    public BaseView(String title) {
        UI.getCurrent().getPage().setTitle(String.format("Crawler Admin | %s", title));
        setWidth(100, PERCENTAGE);
        setSpacing(true);
        setMargin(true);

        HorizontalLayout actionBarLayout = new HorizontalLayout();
        actionBarLayout.setWidth(100, PERCENTAGE);

        MenuBar menu = new MenuBar();

        MenuBar.MenuItem dataItem = menu.addItem("Configuration", null);
        dataItem.addItem("HTTP Sources", (item) -> UI.getCurrent().setContent(new HttpSourcesView()));
        dataItem.addItem("HTTP Source Tests", (item) -> UI.getCurrent().setContent(new HttpSourceTestsView()));
        dataItem.addItem("Named Queries", (item) -> UI.getCurrent().setContent(new NamedQueriesView()));
        dataItem.addItem("Import / Export", (item) -> UI.getCurrent().setContent(new ImportExportView()));

        menu.addItem("Page Analysis", (item) -> UI.getCurrent().setContent(new PageAnalysisView()));

        actionBarLayout.addComponent(menu);

        addComponent(actionBarLayout);
    }

}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/HttpSourceStatsWindow.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;


import com.byteowls.vaadin.chartjs.ChartJs;
import com.byteowls.vaadin.chartjs.config.BarChartConfig;
import com.byteowls.vaadin.chartjs.data.BarDataset;
import com.byteowls.vaadin.chartjs.data.Dataset;
import com.byteowls.vaadin.chartjs.data.LineDataset;
import com.byteowls.vaadin.chartjs.options.Position;
import com.vaadin.ui.Component;
import com.vaadin.ui.Window;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.es.model.DateHistogramValue;

import java.util.List;
import java.util.stream.Collectors;

public class HttpSourceStatsWindow extends Window {

    public HttpSourceStatsWindow(String sourceUrl) {
        setModal(true);
        center();
        setCaption(String.format("%s crawling statistics", sourceUrl));
        setWidth(50, Unit.PERCENTAGE);
        setHeight(50, Unit.PERCENTAGE);
        List<DateHistogramValue> urls = ElasticSearch.getUrlOperations().calculateStats(sourceUrl);
        List<DateHistogramValue> documents = ElasticSearch.getDocumentOperations().calculateStats(sourceUrl);
        Component layout = getChart(sourceUrl, urls, documents);
        layout.setWidth(100, Unit.PERCENTAGE);
        setContent(layout);
    }

    public Component getChart(String sourceUrl, List<DateHistogramValue> urls, List<DateHistogramValue> documents) {
        BarChartConfig config = new BarChartConfig();

        // Fetched documents as bars, discovered URLs as an overlaid line.
        BarDataset docsDataset = new BarDataset().type().label("Fetched Documents")
                .borderColor("rgb(54, 162, 235)")
                .backgroundColor("rgb(54, 162, 235)")
                .borderWidth(2);
        documents.forEach(d -> docsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));

        LineDataset urlsDataset = new LineDataset().type().label("Discovered Urls")
                .borderColor("rgb(75, 192, 192)")
                .backgroundColor("white")
                .borderWidth(2);
        urls.forEach(d -> urlsDataset.addLabeledData(d.getDate(), Double.valueOf(d.getValue())));

        config.data()
                .labelsAsList(urls.stream().map(DateHistogramValue::getDate).collect(Collectors.toList()))
                .addDataset(docsDataset)
                .addDataset(urlsDataset)
                .and();

        config.options()
                .responsive(true)
                .title()
                .display(true)
                .position(Position.LEFT)
                .and()
                .done();

        ChartJs chart = new ChartJs(config);
        chart.setJsLoggingEnabled(true);
        return chart;
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/ImportExportView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view;

import com.vaadin.ui.TabSheet;
import lt.tokenmill.crawling.adminui.view.imports.HttpSourceImportExport;
import lt.tokenmill.crawling.adminui.view.imports.HttpSourceTestImportExport;
import lt.tokenmill.crawling.adminui.view.imports.NamedQueryImportExport;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;

public class ImportExportView extends BaseView {

    public ImportExportView() {
        super("Import / Export");
        TabSheet mainLayout = new TabSheet();
        mainLayout.setWidth(100, PERCENTAGE);
        mainLayout.addTab(new HttpSourceImportExport(), "HTTP Sources");
        mainLayout.addTab(new HttpSourceTestImportExport(), "HTTP Source Tests");
        mainLayout.addTab(new NamedQueryImportExport(), "Named Queries");
        addComponent(mainLayout);
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueriesView.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.namedquery;

import com.vaadin.data.util.BeanItemContainer;
import com.vaadin.data.util.GeneratedPropertyContainer;
import com.vaadin.ui.*;
import lt.tokenmill.crawling.adminui.view.BaseView;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.data.NamedQuery;
import lt.tokenmill.crawling.data.PageableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.vaadin.server.Sizeable.Unit.PERCENTAGE;
import static com.vaadin.server.Sizeable.Unit.PIXELS;

public class NamedQueriesView extends BaseView {

    private static final Logger LOG = LoggerFactory.getLogger(NamedQueriesView.class);

    private Grid itemsGrid = new Grid(new GeneratedPropertyContainer(new BeanItemContainer<>(NamedQuery.class)));
    private Label totalCountLabel = new Label();
    private TextField filterField = new TextField();

    public NamedQueriesView() {
        super("Named Queries");
        HorizontalLayout mainLayout = new HorizontalLayout();
        mainLayout.setWidth(100, PERCENTAGE);
        mainLayout.setHeight(100, PERCENTAGE);
        mainLayout.setSpacing(true);

        VerticalLayout gridLayout = new VerticalLayout();
        gridLayout.setSpacing(true);
        gridLayout.setWidth(100, PERCENTAGE);

        // Search field and create new button
        filterField.setInputPrompt("Enter Name...");
        filterField.addTextChangeListener(event -> refreshGrid(event.getText()));

        Button addNewButton = new Button("Add New Query");
        addNewButton.addClickListener(event -> showNamedQueryForm(new NamedQuery()));

        HorizontalLayout actionHeader = new HorizontalLayout(filterField, addNewButton);
        actionHeader.setSpacing(true);
        actionHeader.setWidth(100, PERCENTAGE);
        filterField.setWidth(100, PERCENTAGE);
        actionHeader.setExpandRatio(filterField, 1.0f);
        gridLayout.addComponent(actionHeader);

        // Grid
        itemsGrid.setWidth(100, PERCENTAGE);
        itemsGrid.setHeight(700, PIXELS);
        itemsGrid.setSelectionMode(Grid.SelectionMode.SINGLE);
        itemsGrid.addSelectionListener(
                e -> {
                    NamedQuery nq = (NamedQuery) itemsGrid.getSelectedRow();
                    if (nq != null) {
                        nq = ElasticSearch.getNamedQueryOperations().get(nq.getName());
                        showNamedQueryForm(nq);
                    }
                });
        itemsGrid.setColumns("name");
        gridLayout.addComponent(itemsGrid);
        gridLayout.addComponent(totalCountLabel);
        refreshGrid(filterField.getValue());
        mainLayout.addComponent(gridLayout);
        mainLayout.setExpandRatio(gridLayout, 1f);
        addComponent(mainLayout);
    }

    private void refreshGrid(String text) {
        PageableList<NamedQuery> data = ElasticSearch.getNamedQueryOperations().filter(text);
        itemsGrid.getContainerDataSource().removeAllItems();
        for (NamedQuery nq : data.getItems()) {
            itemsGrid.getContainerDataSource().addItem(nq);
        }
        totalCountLabel.setValue(String.format("Total count: %d", data.getTotalCount()));
        LOG.info("Refreshed grid using filter '{}'. Total items: {}", text, data.getTotalCount());
Total items: {}", text, data.getTotalCount()); 79 | } 80 | 81 | private void showNamedQueryForm(NamedQuery nq) { 82 | NamedQueryFormWindow formWindow = new NamedQueryFormWindow(nq); 83 | formWindow.addAfterUpdateListener(() -> refreshGrid(filterField.getValue())); 84 | UI.getCurrent().addWindow(formWindow); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/namedquery/NamedQueryResultsPanel.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.adminui.view.namedquery; 2 | 3 | import com.vaadin.shared.ui.label.ContentMode; 4 | import com.vaadin.ui.Label; 5 | import com.vaadin.ui.Panel; 6 | import com.vaadin.ui.VerticalLayout; 7 | import com.vaadin.ui.themes.ValoTheme; 8 | import lt.tokenmill.crawling.data.DataUtils; 9 | import lt.tokenmill.crawling.data.HttpArticle; 10 | import lt.tokenmill.crawling.data.PageableList; 11 | 12 | public class NamedQueryResultsPanel extends Panel { 13 | 14 | public NamedQueryResultsPanel(PageableList results) { 15 | VerticalLayout layout = new VerticalLayout(); 16 | layout.setMargin(true); 17 | 18 | Label countLabel = new Label(String.format("%s documents matched", results.getTotalCount())); 19 | countLabel.addStyleName(ValoTheme.LABEL_LARGE); 20 | countLabel.setSizeFull(); 21 | layout.addComponent(countLabel); 22 | 23 | for (HttpArticle article : results.getItems()) { 24 | String labelHtml = String.format("%s %s - %s", 25 | DataUtils.formatInUTC(article.getPublished()), article.getUrl(), article.getTitle(), article.getSource()); 26 | Label articleLabel = new Label(labelHtml); 27 | articleLabel.setContentMode(ContentMode.HTML); 28 | articleLabel.setSizeFull(); 29 | layout.addComponent(articleLabel); 30 | } 31 | setContent(layout); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/HttpSourceAllTestsWindow.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.adminui.view.sourcetest; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.vaadin.ui.*; 5 | import com.vaadin.ui.themes.ValoTheme; 6 | import lt.tokenmill.crawling.commonui.ElasticSearch; 7 | import lt.tokenmill.crawling.data.HttpSource; 8 | import lt.tokenmill.crawling.data.HttpSourceTest; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.List; 13 | 14 | public class HttpSourceAllTestsWindow extends Window { 15 | 16 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceAllTestsWindow.class); 17 | 18 | private List afterUpdateListeners = Lists.newArrayList(); 19 | 20 | private Button cancelButton = new Button("Close", (event) -> this.close()); 21 | 22 | public HttpSourceAllTestsWindow() { 23 | setCaption("All Tests"); 24 | setModal(true); 25 | center(); 26 | setWidth(80, Unit.PERCENTAGE); 27 | setHeight(80, Unit.PERCENTAGE); 28 | 29 | VerticalLayout mainLayout = new VerticalLayout(); 30 | mainLayout.setMargin(true); 31 | 32 | List tests = ElasticSearch.getHttpSourceTestOperations().all(); 33 | for (HttpSourceTest test : tests) { 34 | HttpSource source = ElasticSearch.getHttpSourceOperations().get(test.getSource()); 35 | if (source == null) { 36 | Label noSourceLabel = new Label(String.format("Source configuration '%s' not found", test.getSource())); 37 | 
--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/HttpSourceAllTestsWindow.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.sourcetest;

import com.google.common.collect.Lists;
import com.vaadin.ui.*;
import com.vaadin.ui.themes.ValoTheme;
import lt.tokenmill.crawling.commonui.ElasticSearch;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

public class HttpSourceAllTestsWindow extends Window {

    private static final Logger LOG = LoggerFactory.getLogger(HttpSourceAllTestsWindow.class);

    private List afterUpdateListeners = Lists.newArrayList();

    private Button cancelButton = new Button("Close", (event) -> this.close());

    public HttpSourceAllTestsWindow() {
        setCaption("All Tests");
        setModal(true);
        center();
        setWidth(80, Unit.PERCENTAGE);
        setHeight(80, Unit.PERCENTAGE);

        VerticalLayout mainLayout = new VerticalLayout();
        mainLayout.setMargin(true);

        // Run every stored test against its source configuration and show one panel per test.
        List<HttpSourceTest> tests = ElasticSearch.getHttpSourceTestOperations().all();
        for (HttpSourceTest test : tests) {
            HttpSource source = ElasticSearch.getHttpSourceOperations().get(test.getSource());
            if (source == null) {
                Label noSourceLabel = new Label(String.format("Source configuration '%s' not found", test.getSource()));
                noSourceLabel.addStyleName(ValoTheme.LABEL_FAILURE);
                noSourceLabel.setSizeFull();
                mainLayout.addComponent(noSourceLabel);
            } else {
                mainLayout.addComponent(new TestResultsPanel(source, test));
            }
        }

        HorizontalLayout actions = new HorizontalLayout(cancelButton);
        actions.setSpacing(true);

        setContent(mainLayout);
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/main/java/lt/tokenmill/crawling/adminui/view/sourcetest/TestResultsPanel.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.view.sourcetest;

import com.vaadin.ui.*;
import com.vaadin.ui.themes.ValoTheme;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class TestResultsPanel extends Panel {

    private Map difference;

    public TestResultsPanel(HttpSource source, HttpSourceTest test) {
        // An empty difference map means extraction matched all expectations.
        this.difference = HttpSourceTester.test(source, test);
        VerticalLayout layout = new VerticalLayout();
        layout.setMargin(true);
        if (this.difference.isEmpty()) {
            Label resultLabel = new Label(String.format("'%s' Test Passed", test.getUrl()));
            resultLabel.addStyleName(ValoTheme.LABEL_SUCCESS);
            resultLabel.setSizeFull();
            layout.addComponent(resultLabel);
        } else {
            Label resultLabel = new Label(String.format("'%s' Test Failed", test.getUrl()));
            resultLabel.addStyleName(ValoTheme.LABEL_FAILURE);
            resultLabel.setSizeFull();
            layout.addComponent(resultLabel);
        }

        for (Map.Entry diff : difference.entrySet()) {
            HorizontalLayout fieldLayout = new HorizontalLayout();
            fieldLayout.setSizeFull();

            Label resultLabel = new Label(diff.getKey());
            resultLabel.addStyleName(ValoTheme.LABEL_LARGE);
            fieldLayout.addComponent(resultLabel);
            fieldLayout.setComponentAlignment(resultLabel, Alignment.MIDDLE_CENTER);
            fieldLayout.setExpandRatio(resultLabel, 0.15f);

            FormLayout valuesLayout = new FormLayout();
            valuesLayout.setWidth(100, Unit.PERCENTAGE);
            valuesLayout.setSizeFull();

            TextArea expected = new TextArea("Expected");
            expected.setSizeFull();
            expected.setRows(2);
            expected.setValue(diff.getValue().getExpected());
            expected.setReadOnly(true);

            TextArea actual = new TextArea("Actual");
            actual.setSizeFull();
            actual.setRows(2);
            actual.setValue(diff.getValue().getActual());
            actual.setReadOnly(true);

            valuesLayout.addComponents(expected, actual);

            fieldLayout.addComponent(valuesLayout);
            fieldLayout.setExpandRatio(valuesLayout, 0.85f);

            layout.addComponent(fieldLayout);
        }

        setContent(layout);
    }

    public boolean passed() {
        return difference != null && difference.isEmpty();
    }
}
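`TestResultsPanel` treats an empty difference map as a pass; a sketch of driving the same check directly (the empty source/test objects are placeholders — real ones carry selectors and expected values):

```java
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class SourceTestSketch {

    public static void main(String[] args) {
        HttpSource source = new HttpSource();       // extraction rules would go here
        HttpSourceTest test = new HttpSourceTest(); // expected title/text/date would go here
        // Same call TestResultsPanel makes; an empty map means the test passed.
        Map difference = HttpSourceTester.test(source, test);
        System.out.println(difference.isEmpty() ? "passed" : "failed: " + difference.keySet());
    }
}
```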
--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=DEBUG, stdout

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n


log4j.logger.org.apache=INFO
log4j.logger.org.eclipse.jetty=INFO
log4j.logger.org.elasticsearch=INFO

--------------------------------------------------------------------------------
/administration-ui/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/crawling-framework/987100fee5965b43e178c9096ab3b2aa3a11fac7/administration-ui/src/main/resources/log4j2.properties

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/addons.scss:
--------------------------------------------------------------------------------
/* This file is automatically managed and will be overwritten from time to time. */
/* Do not manually edit this file. */

/* Import and include this mixin into your project theme to include the addon themes */
@mixin addons {
}

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/crawleradmintheme.scss:
--------------------------------------------------------------------------------
// If you edit this file you need to compile the theme. See README.md for details.
// Global variable overrides. Must be declared before importing Valo.
// Defines the plaintext font size, weight and family. Font size affects general component sizing.

//$v-font-size: 16px;
//$v-font-weight: 300;
//$v-font-family: "Open Sans", sans-serif;

// Defines the border used by all components.
//$v-border: 1px solid (v-shade 0.7);
//$v-border-radius: 4px;

// Affects the color of some component elements, e.g Button, Panel title, etc
//$v-background-color: hsl(210, 0%, 98%);

// Affects the color of content areas, e.g Panel and Window content, TextField input etc
//$v-app-background-color: $v-background-color;

// Affects the visual appearance of all components
//$v-gradient: v-linear 8%;
//$v-bevel-depth: 30%;
//$v-shadow-opacity: 5%;

// Defines colors for indicating status (focus, success, failure)
//$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically
//$v-friendly-color: #2c9720;
//$v-error-indicator-color: #ed473b;

// For more information, see: https://vaadin.com/book/-/page/themes.valo.html
// Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples

@import "../valo/valo.scss";

@mixin crawleradmintheme {
  @include valo;

  // Insert your own theme rules here
}

--------------------------------------------------------------------------------
/administration-ui/src/main/webapp/VAADIN/themes/crawleradmintheme/styles.scss:
--------------------------------------------------------------------------------
@import "crawleradmintheme.scss";
@import "addons.scss";

// This file prefixes all rules with the theme name to avoid causing conflicts with other themes.
// The actual styles should be defined in crawleradmintheme.scss

.crawleradmintheme {
  @include addons;
  @include crawleradmintheme;
}

--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourceTestCSVUtilsTest.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import lt.tokenmill.crawling.data.HttpSourceTest;
import org.junit.Test;

import java.net.URL;
import java.time.Instant;
import java.util.Map;

import static lt.tokenmill.crawling.adminui.utils.HttpSourceTestCSVUtils.CSV_COLUMNS;
import static org.junit.Assert.assertEquals;

public class HttpSourceTestCSVUtilsTest {

    protected String loadHtml(String name) throws Exception {
        URL htmlResource = Resources.getResource(name + ".html");
        return Resources.toString(htmlResource, Charsets.UTF_8);
    }

    @Test
    public void testHttpSourceTestToCsvAndBack() throws Exception {
        HttpSourceTest httpSourceTest = new HttpSourceTest();
        httpSourceTest.setUrl("http://www.tokenmill.lt/");
        httpSourceTest.setSource("http://www.tokenmill.lt/");
        httpSourceTest.setHtml(loadHtml("www.tokenmill.lt"));
        httpSourceTest.setUrlAccepted(true);
        httpSourceTest.setTitle("TokenMill");
        httpSourceTest.setText("Some text");
        httpSourceTest.setDate(Instant.now().toString());

        String[] csvRow = HttpSourceTestCSVUtils.mapHttpSourceTestToCsvRow(httpSourceTest);
        String[] headerLine = CSV_COLUMNS;
        Map<String, Integer> columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS);
        HttpSourceTest fromRow = HttpSourceTestCSVUtils.mapCsvRowToHttpSourceTest(csvRow, columnIndexes);
        assertEquals(httpSourceTest.getUrl(), fromRow.getUrl());
        assertEquals(httpSourceTest.getSource(), fromRow.getSource());
        assertEquals(httpSourceTest.getHtml(), fromRow.getHtml());
        assertEquals(httpSourceTest.getUrlAccepted(), fromRow.getUrlAccepted());
        assertEquals(httpSourceTest.getTitle(), fromRow.getTitle());
        assertEquals(httpSourceTest.getText(), fromRow.getText());
        assertEquals(httpSourceTest.getDate(), fromRow.getDate());
    }
}

--------------------------------------------------------------------------------
/administration-ui/src/test/java/lt/tokenmill/crawling/adminui/utils/HttpSourcesCSVUtilsTest.java:
--------------------------------------------------------------------------------
package lt.tokenmill.crawling.adminui.utils;

import lt.tokenmill.crawling.data.HttpSource;
import org.junit.Test;

import java.util.Arrays;
import java.util.Map;

import static lt.tokenmill.crawling.adminui.utils.HttpSourceCSVUtils.CSV_COLUMNS;
import static org.junit.Assert.assertEquals;

public class HttpSourcesCSVUtilsTest {

    @Test
    public void testHttpSourcesToCsvAndBack() {
        HttpSource source = new HttpSource();
        source.setUrl("url");
        source.setName("name");
        source.setLanguage("language");
        source.setTimezone("timezone");
        source.setEnabled(true);
        source.setDiscoveryEnabled(true);
        source.setUrlRecrawlDelayInSecs(1);
        source.setFeedRecrawlDelayInSecs(1);
        source.setSitemapRecrawlDelayInSecs(1);
        source.setUrls(Arrays.asList("url1", "url2"));
"url2")); 27 | source.setFeeds(Arrays.asList("feed1", "feed2")); 28 | source.setSitemaps(Arrays.asList("sitemap1", "sitemap2")); 29 | source.setCategories(Arrays.asList("cat1", "cat2")); 30 | source.setAppIds(Arrays.asList("app1", "app2")); 31 | source.setUrlFilters(Arrays.asList("f1", "f2")); 32 | source.setUrlNormalizers(Arrays.asList("n1", "n2")); 33 | source.setTitleSelectors(Arrays.asList("ts1", "ts2")); 34 | source.setTextSelectors(Arrays.asList("ts1", "ts2")); 35 | source.setTextNormalizers(Arrays.asList("tn1", "tn2")); 36 | source.setDateSelectors(Arrays.asList("ds1", "ds2")); 37 | source.setDateRegexps(Arrays.asList("dr1", "dr2")); 38 | source.setDateFormats(Arrays.asList("df1", "df2")); 39 | 40 | String[] row = HttpSourceCSVUtils.mapHttpSourceToCsvRow(source); 41 | String[] headerLine = CSV_COLUMNS; 42 | Map columnIndexes = CSVUtils.resolveColumnIndexes(headerLine, CSV_COLUMNS); 43 | HttpSource fromRow = HttpSourceCSVUtils.mapCsvRowToHttpSource(row, columnIndexes); 44 | assertEquals(source, fromRow); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /analysis-ui/conf/development.properties: -------------------------------------------------------------------------------- 1 | port=8080 2 | es.hostname=localhost 3 | es.transport.port=9300 4 | es.httpsource.index.name=http_sources 5 | es.httpsource.doc.type=http_source 6 | es.httpsourcetest.index.name=http_source_tests 7 | es.httpsourcetest.doc.type=http_source_test 8 | es.namedqueries.index.name=named_queries 9 | es.namedqueries.doc.type=named_query 10 | es.docs.index.name=docs 11 | es.docs.doc.type=doc -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/AnalysisUI.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui; 2 | 3 | import com.vaadin.annotations.Theme; 4 | import com.vaadin.annotations.VaadinServletConfiguration; 5 | import com.vaadin.server.VaadinRequest; 6 | import com.vaadin.server.VaadinServlet; 7 | import com.vaadin.ui.UI; 8 | import lt.tokenmill.crawling.analysisui.view.SearchView; 9 | 10 | import javax.servlet.annotation.WebServlet; 11 | 12 | @Theme("analysistheme") 13 | public class AnalysisUI extends UI { 14 | 15 | @Override 16 | protected void init(VaadinRequest vaadinRequest) { 17 | setContent(new SearchView()); 18 | } 19 | 20 | @WebServlet(urlPatterns = "/*", name = "AnalysisUIServlet", asyncSupported = true) 21 | @VaadinServletConfiguration(ui = AnalysisUI.class, productionMode = false) 22 | public static class AnalysisUIServlet extends VaadinServlet { 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/Application.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui; 2 | 3 | import com.vaadin.server.VaadinServlet; 4 | import lt.tokenmill.crawling.commonui.Configuration; 5 | import org.eclipse.jetty.server.Server; 6 | import org.eclipse.jetty.servlet.ServletContextHandler; 7 | import org.eclipse.jetty.servlet.ServletHolder; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | 12 | public class Application { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(Application.class); 15 | private static final Boolean PRODUCTION_MODE = true; 16 | 17 | public static void 
main(String[] args) { 18 | int port = Configuration.INSTANCE.getInt("port", 8080); 19 | Server server = new Server(port); 20 | ServletContextHandler contextHandler 21 | = new ServletContextHandler(ServletContextHandler.SESSIONS); 22 | contextHandler.setContextPath("/"); 23 | ServletHolder sh = new ServletHolder(new VaadinServlet()); 24 | contextHandler.addServlet(sh, "/*"); 25 | contextHandler.setInitParameter("ui", AnalysisUI.class.getCanonicalName()); 26 | contextHandler.setInitParameter("productionMode", String.valueOf(PRODUCTION_MODE)); 27 | server.setHandler(contextHandler); 28 | try { 29 | server.start(); 30 | server.join(); 31 | } catch (Exception e) { 32 | LOG.error("Failed to start application", e); 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/search/ResultPanel.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.search; 2 | 3 | import com.vaadin.shared.ui.label.ContentMode; 4 | import com.vaadin.ui.Label; 5 | import com.vaadin.ui.Panel; 6 | import com.vaadin.ui.VerticalLayout; 7 | import lt.tokenmill.crawling.data.DataUtils; 8 | import lt.tokenmill.crawling.data.HighlightedSearchResult; 9 | import lt.tokenmill.crawling.data.HttpArticle; 10 | 11 | import java.util.stream.Collectors; 12 | 13 | public class ResultPanel extends Panel { 14 | 15 | private static final String RESULTS_TEMPLATE = "%s <a href=\"%s\">%s</a>  <i>%s</i><br>%s"; 16 | 17 | public ResultPanel(HighlightedSearchResult searchResult) { 18 | HttpArticle article = searchResult.getArticle(); 19 | String highlights = searchResult.getHighlights().stream().collect(Collectors.joining("<br>...<br>")); 20 | String text = String.format(RESULTS_TEMPLATE, 21 | DataUtils.formatInUTC(article.getPublished()).replace("T", " "), 22 | article.getUrl(), article.getTitle(), article.getSource(), highlights); 23 | Label content = new Label(text); 24 | content.setContentMode(ContentMode.HTML); 25 | VerticalLayout component = new VerticalLayout(content); 26 | component.setMargin(true); 27 | setContent(component); 28 | } 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/BaseView.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.view; 2 | 3 | 4 | import com.vaadin.ui.HorizontalLayout; 5 | import com.vaadin.ui.MenuBar; 6 | import com.vaadin.ui.UI; 7 | import com.vaadin.ui.VerticalLayout; 8 | 9 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE; 10 | 11 | public class BaseView extends VerticalLayout { 12 | 13 | public BaseView(String title) { 14 | UI.getCurrent().getPage().setTitle(String.format("Analysis | %s", title)); 15 | setWidth(100, PERCENTAGE); 16 | setSpacing(true); 17 | setMargin(true); 18 | 19 | HorizontalLayout actionBarLayout = new HorizontalLayout(); 20 | actionBarLayout.setWidth(100, PERCENTAGE); 21 | 22 | MenuBar menu = new MenuBar(); 23 | 24 | menu.addItem("Search", (item) -> UI.getCurrent().setContent(new SearchView())); 25 | menu.addItem("Context Cloud", (item) -> UI.getCurrent().setContent(new ContextCloudView())); 26 | 27 | actionBarLayout.addComponent(menu); 28 | 29 | addComponent(actionBarLayout); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/java/lt/tokenmill/crawling/analysisui/view/SearchView.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.analysisui.view; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.vaadin.ui.*; 5 | import lt.tokenmill.crawling.analysisui.search.ResultPanel; 6 | import lt.tokenmill.crawling.commonui.ElasticSearch; 7 | import lt.tokenmill.crawling.data.HighlightedSearchResult; 8 | import lt.tokenmill.crawling.data.NamedQuery; 9 | import lt.tokenmill.crawling.data.PageableList; 10 | import lt.tokenmill.crawling.parser.utils.QueryParser; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.util.List; 15 | import java.util.stream.Collectors; 16 | 17 | import static com.vaadin.server.Sizeable.Unit.PERCENTAGE; 18 | 19 | public class SearchView extends BaseView { 20 | 21 | private static final Logger LOG = LoggerFactory.getLogger(SearchView.class); 22 | 23 | private TextField filterField = new TextField(); 24 | private Label queryDescriptionLabel = new Label(); 25 | private VerticalLayout resultLayout = new VerticalLayout(); 26 | 27 | public SearchView() { 28 | super("Search"); 29 | Button searchButton = new Button("Search"); 30 | searchButton.addClickListener(event -> search()); 31 | 32 | VerticalLayout searchLayout = new VerticalLayout(); 33 | searchLayout.setSpacing(true); 34 | searchLayout.setWidth(50, PERCENTAGE); 35 | 36 | HorizontalLayout actionHeader = new HorizontalLayout(filterField, searchButton); 37 | actionHeader.setSpacing(true); 38 | actionHeader.setWidth(100, PERCENTAGE); 39 | actionHeader.setExpandRatio(filterField, 1.0f); 40 | filterField.setWidth(100, PERCENTAGE); 41 | 42 | searchLayout.addComponent(actionHeader); 43 | 
searchLayout.addComponent(queryDescriptionLabel); 44 | 45 | addComponent(searchLayout); 46 | setComponentAlignment(searchLayout, Alignment.TOP_CENTER); 47 | 48 | resultLayout.setWidth(80, PERCENTAGE); 49 | resultLayout.setSpacing(true); 50 | 51 | addComponent(resultLayout); 52 | setComponentAlignment(resultLayout, Alignment.TOP_CENTER); 53 | 54 | } 55 | 56 | private void search() { 57 | resultLayout.removeAllComponents(); 58 | List<String> query = QueryParser.parseQuery(filterField.getValue()); 59 | LOG.info("Parsed '{}' from query '{}'", query, filterField.getValue()); 60 | List<NamedQuery> includedNamed = Lists.newArrayList(); 61 | List<NamedQuery> excludedNamed = Lists.newArrayList(); 62 | StringBuilder additionalQuery = new StringBuilder(); 63 | for (String q : query) { 64 | boolean excluded = q.startsWith("-"); 65 | String name = q.replaceAll("^[+-]+", ""); 66 | NamedQuery namedQuery = ElasticSearch.getNamedQueryOperations().get(name); 67 | if (namedQuery != null && excluded) { 68 | excludedNamed.add(namedQuery); 69 | LOG.info("Named query '{}' is negative", namedQuery.getName()); 70 | } else if (namedQuery != null) { 71 | includedNamed.add(namedQuery); 72 | LOG.info("Named query '{}' is positive", namedQuery.getName()); 73 | } else { 74 | additionalQuery.append(" ").append(q); 75 | } 76 | } 77 | LOG.info("Additional query: '{}'", additionalQuery.toString().trim()); 78 | PageableList<HighlightedSearchResult> result = ElasticSearch.getDocumentOperations().query(includedNamed, excludedNamed, additionalQuery.toString().trim()); 79 | List<NamedQuery> namedQueries = Lists.newArrayList(includedNamed); 80 | namedQueries.addAll(excludedNamed); 81 | 82 | queryDescriptionLabel.setValue(String.format("Named Queries: %s, Additional Query: '%s'", 83 | namedQueries.stream().map(NamedQuery::getName).collect(Collectors.joining("', '", "'", "'")), 84 | additionalQuery.toString().trim())); 85 | 86 | for (HighlightedSearchResult r : result.getItems()) { 87 | resultLayout.addComponent(new ResultPanel(r)); 88 | } 89 | 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /analysis-ui/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, stdout 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Target=System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n 7 | 8 | 9 | log4j.logger.org.apache=INFO 10 | log4j.logger.org.eclipse.jetty=INFO 11 | log4j.logger.org.elasticsearch=INFO -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/addons.scss: -------------------------------------------------------------------------------- 1 | /* This file is automatically managed and will be overwritten from time to time. */ 2 | /* Do not manually edit this file. */ 3 | 4 | /* Import and include this mixin into your project theme to include the addon themes */ 5 | @mixin addons { 6 | } 7 | 8 | -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/analysistheme.scss: -------------------------------------------------------------------------------- 1 | // If you edit this file you need to compile the theme. See README.md for details. 2 | // Global variable overrides. Must be declared before importing Valo.
3 | // Defines the plaintext font size, weight and family. Font size affects general component sizing. 4 | 5 | //$v-font-size: 16px; 6 | //$v-font-weight: 300; 7 | //$v-font-family: "Open Sans", sans-serif; 8 | 9 | // Defines the border used by all components. 10 | //$v-border: 1px solid (v-shade 0.7); 11 | //$v-border-radius: 4px; 12 | 13 | // Affects the color of some component elements, e.g Button, Panel title, etc 14 | //$v-background-color: hsl(210, 0%, 98%); 15 | 16 | // Affects the color of content areas, e.g Panel and Window content, TextField input etc 17 | //$v-app-background-color: $v-background-color; 18 | 19 | // Affects the visual appearance of all components 20 | //$v-gradient: v-linear 8%; 21 | //$v-bevel-depth: 30%; 22 | //$v-shadow-opacity: 5%; 23 | 24 | // Defines colors for indicating status (focus, success, failure) 25 | //$v-focus-color: valo-focus-color(); // Calculates a suitable color automatically 26 | //$v-friendly-color: #2c9720; 27 | //$v-error-indicator-color: #ed473b; 28 | 29 | // For more information, see: https://vaadin.com/book/-/page/themes.valo.html 30 | // Example variants can be copy/pasted from https://vaadin.com/wiki/-/wiki/Main/Valo+Examples 31 | 32 | @import "../valo/valo.scss"; 33 | 34 | @mixin analysistheme { 35 | @include valo; 36 | 37 | // Insert your own theme rules here 38 | } -------------------------------------------------------------------------------- /analysis-ui/src/main/webapp/VAADIN/themes/analysistheme/styles.scss: -------------------------------------------------------------------------------- 1 | @import "analysistheme.scss"; 2 | @import "addons.scss"; 3 | 4 | // This file prefixes all rules with the theme name to avoid causing conflicts with other themes. 5 | // The actual styles should be defined in analysisheme.scss 6 | 7 | .analysistheme { 8 | @include addons; 9 | @include analysistheme; 10 | } -------------------------------------------------------------------------------- /bin/create-es-index.sh: -------------------------------------------------------------------------------- 1 | # $1 - index name (docs, http_sources) 2 | # $2 - ES index config file name 3 | # $3 - ES host 4 | # $4 - application name 5 | 6 | if [ -z "$4" ] 7 | then 8 | export INDEX_URL="http://$3:9200/$1_v1" 9 | else 10 | export INDEX_URL="http://$3:9200/$4-$1_v1" 11 | fi 12 | 13 | 14 | curl -H "Content-Type:application/json" -XDELETE "$INDEX_URL" 15 | echo 16 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL" -d @elasticsearch/src/main/resources/indices/$2 17 | echo 18 | if [ -z "$4" ] 19 | then 20 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$1" 21 | echo 22 | else 23 | curl -H "Content-Type:application/json" -XPUT "$INDEX_URL/_alias/$4-$1" 24 | echo 25 | fi 26 | -------------------------------------------------------------------------------- /bin/create-es-indices.sh: -------------------------------------------------------------------------------- 1 | bin/create-es-index.sh docs document.json ${1:-localhost} $2 2 | bin/create-es-index.sh named_queries query.json ${1:-localhost} $2 3 | bin/create-es-index.sh http_sources http_source.json ${1:-localhost} $2 4 | bin/create-es-index.sh http_source_tests http_source_test.json ${1:-localhost} $2 5 | bin/create-es-index.sh urls url.json ${1:-localhost} $2 6 | -------------------------------------------------------------------------------- /bin/deploy-crawler.sh: -------------------------------------------------------------------------------- 1 | 
STORM_HOME=/opt/storm/apache-storm-1.1.1 2 | mvn clean install -Pbigjar -Dstorm.scope=provided 3 | $STORM_HOME/bin/storm jar crawler/target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -conf crawler/conf/local.yaml 4 | -------------------------------------------------------------------------------- /bin/run-administration-ui.sh: -------------------------------------------------------------------------------- 1 | ( cd administration-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/administration-ui-standalone.jar ) 2 | -------------------------------------------------------------------------------- /bin/run-analysis-ui.sh: -------------------------------------------------------------------------------- 1 | ( cd analysis-ui && mvn clean package -Pbigjar && java -Dconfig=conf/development.properties -jar target/analysis-ui-standalone.jar ) -------------------------------------------------------------------------------- /bin/run-crawler.sh: -------------------------------------------------------------------------------- 1 | ( cd crawler && mvn package -Dstorm.scope=compile -Dlog4j.scope=compile -Pbigjar -DskipTests && java -cp target/crawler-standalone.jar lt.tokenmill.crawling.crawler.CrawlerTopology -local -conf conf/local.yaml ) 2 | -------------------------------------------------------------------------------- /crawler/conf/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | fetcher.server.delay: 4.5 2 | fetcher.server.min.delay: 3.0 3 | fetcher.queue.mode: "byHost" 4 | fetcher.threads.per.queue: 1 5 | fetcher.threads.number: 5 6 | 7 | partition.url.mode: "byHost" 8 | 9 | metadata.track.path: false 10 | metadata.track.depth: false 11 | metadata.transfer: 12 | - "source" 13 | 14 | http.agent.name: "NewsRadar" 15 | http.agent.version: "1.0" 16 | http.agent.description: "News Crawler" 17 | http.agent.url: "" 18 | http.agent.email: "" 19 | 20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3" 21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 22 | http.content.limit: 1048576 23 | http.store.responsetime: false 24 | http.timeout: 30000 25 | 26 | http.robots.403.allow: true 27 | 28 | protocols: "http,https" 29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 31 | 32 | urlfilters.config.file: "urlfilters.json" 33 | 34 | # revisit a page monthly (value in minutes) 35 | fetchInterval.default: 44640 36 | 37 | # revisit a page with a fetch error after 2 hours (value in minutes) 38 | fetchInterval.fetch.error: 120 39 | 40 | # revisit a page with an error every month (value in minutes) 41 | fetchInterval.error: 44640 42 | 43 | # Default implementation of Scheduler 44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler" 45 | 46 | topology.workers: 1 47 | topology.sleep.spout.wait.strategy.time.ms: 5000 48 | topology.message.timeout.secs: 300 49 | topology.max.spout.pending: 100 50 | topology.debug: false 51 | 52 | # ElasticSearch configuration 53 | es.hostname: "elasticsearch" 54 | es.rest.port: 9200 55 | 56 | es.urls.index.name: "urls" 57 | es.urls.doc.type: "url" 58 | es.docs.index.name: "docs" 59 | es.docs.doc.type: "doc" 60 | es.httpsource.index.name: "http_sources" 61 | es.httpsource.doc.type: "http_source" 62 | 63 | # MetricsConsumer configuration 64 | 
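# Note: es.metrics.addresses below points at the 9300 transport address,
# while the crawler's own Elasticsearch operations go through es.rest.port (9200) above.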
es.metrics.addresses: "elasticsearch:9300" 65 | es.metrics.index.name: "metrics" 66 | es.metrics.doc.type: "datapoint" 67 | es.metrics.cluster.name: "elasticsearch" 68 | es.metrics.blacklist: 69 | - "__" 70 | - "uptime" 71 | - "memory" 72 | - "GC" 73 | - "newWorkerEvent" 74 | - "startTimeSecs" 75 | -------------------------------------------------------------------------------- /crawler/conf/local.yaml: -------------------------------------------------------------------------------- 1 | fetcher.server.delay: 4.5 2 | fetcher.server.min.delay: 3.0 3 | fetcher.queue.mode: "byHost" 4 | fetcher.threads.per.queue: 1 5 | fetcher.threads.number: 5 6 | 7 | partition.url.mode: "byHost" 8 | 9 | metadata.track.path: false 10 | metadata.track.depth: false 11 | metadata.transfer: 12 | - "source" 13 | 14 | http.agent.name: "NewsRadar" 15 | http.agent.version: "1.0" 16 | http.agent.description: "News Crawler" 17 | http.agent.url: "" 18 | http.agent.email: "" 19 | 20 | http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3" 21 | http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 22 | http.content.limit: 1048576 23 | http.store.responsetime: false 24 | http.timeout: 30000 25 | 26 | http.robots.403.allow: true 27 | 28 | protocols: "http,https" 29 | http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 30 | https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" 31 | 32 | urlfilters.config.file: "urlfilters.json" 33 | 34 | # revisit a page monthly (value in minutes) 35 | fetchInterval.default: 44640 36 | 37 | # revisit a page with a fetch error after 2 hours (value in minutes) 38 | fetchInterval.fetch.error: 120 39 | 40 | # revisit a page with an error every month (value in minutes) 41 | fetchInterval.error: 44640 42 | 43 | # Default implementation of Scheduler 44 | scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler" 45 | 46 | topology.workers: 1 47 | topology.sleep.spout.wait.strategy.time.ms: 5000 48 | topology.message.timeout.secs: 300 49 | topology.max.spout.pending: 100 50 | topology.debug: false 51 | 52 | # ElasticSearch configuration 53 | es.hostname: "localhost" 54 | es.rest.port: 9200 55 | 56 | es.urls.index.name: "urls" 57 | es.urls.doc.type: "url" 58 | es.docs.index.name: "docs" 59 | es.docs.doc.type: "doc" 60 | es.httpsource.index.name: "http_sources" 61 | es.httpsource.doc.type: "http_source" 62 | 63 | # MetricsConsumer configuration 64 | es.metrics.addresses: "localhost:9300" 65 | es.metrics.index.name: "metrics" 66 | es.metrics.doc.type: "datapoint" 67 | es.metrics.cluster.name: "elasticsearch" 68 | es.metrics.blacklist: 69 | - "__" 70 | - "uptime" 71 | - "memory" 72 | - "GC" 73 | - "newWorkerEvent" 74 | - "startTimeSecs" 75 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerConstants.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; 4 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.util.concurrent.TimeUnit; 9 | 10 | public class CrawlerConstants { 11 | private static final Logger LOG = LoggerFactory.getLogger(CrawlerConstants.class); 12 | 13 | private static long getReloadDelayInSeconds() { 14 | long reloadDelay = 300; 15 | 
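// The DEFAULT_SOURCE_RELOAD_DELAY environment variable, when set, overrides
// this 300-second default; its value is interpreted as seconds.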
String envVar = System.getenv("DEFAULT_SOURCE_RELOAD_DELAY"); 16 | if (envVar != null) { 17 | try { 18 | reloadDelay = Long.parseLong(envVar); 19 | } catch (NumberFormatException e) { 20 | LOG.warn("Environment variable 'DEFAULT_SOURCE_RELOAD_DELAY' is not a number '{}'", envVar); 21 | } 22 | } 23 | return reloadDelay; 24 | } 25 | 26 | public static final long MIN_FETCH_DELAY = TimeUnit.MINUTES.toMillis(1); 27 | public static final long DEFAULT_URL_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10); 28 | public static final long DEFAULT_FEED_FETCH_DELAY = TimeUnit.MINUTES.toMillis(10); 29 | public static final long DEFAULT_SITEMAP_FETCH_DELAY = TimeUnit.MINUTES.toMillis(30); 30 | public static final long DEFAULT_SOURCE_RELOAD_DELAY = TimeUnit.SECONDS.toMillis(getReloadDelayInSeconds()); 31 | 32 | public static final String META_IS_SITEMAP = SiteMapParserBolt.isSitemapKey; 33 | public static final String META_IS_FEED = FeedParserBolt.isFeedKey; 34 | public static final String META_IS_SEED = "isSeed"; 35 | public static final String META_SOURCE = "source"; 36 | public static final String META_PUBLISHED = "published"; 37 | public static final String META_DISCOVERED = "discovered"; 38 | public static final String META_FEED_PUBLISHED = "feed.publishedDate"; 39 | 40 | public static final String URL_FILTERS_FILE = "urlfilters.config.file"; 41 | 42 | public static final String PARTIAL_ANALYSIS_STATUS = "PARTIAL_ANALYSIS"; 43 | } 44 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/CrawlerTopology.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.ConfigurableTopology; 4 | import com.digitalpebble.stormcrawler.Constants; 5 | import com.digitalpebble.stormcrawler.bolt.FeedParserBolt; 6 | import com.digitalpebble.stormcrawler.bolt.FetcherBolt; 7 | import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt; 8 | import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt; 9 | import lt.tokenmill.crawling.crawler.bolt.ArticleIndexerBolt; 10 | import lt.tokenmill.crawling.crawler.bolt.LinkExtractorBolt; 11 | import lt.tokenmill.crawling.crawler.bolt.StatusUpdaterBolt; 12 | import lt.tokenmill.crawling.crawler.spout.UrlGeneratorSpout; 13 | import org.apache.storm.Config; 14 | import org.apache.storm.topology.IRichBolt; 15 | import org.apache.storm.topology.IRichSpout; 16 | import org.apache.storm.topology.TopologyBuilder; 17 | import org.apache.storm.tuple.Fields; 18 | 19 | public class CrawlerTopology extends ConfigurableTopology { 20 | 21 | private final ServiceProvider serviceProvider; 22 | 23 | public static void main(String[] args) throws Exception { 24 | ConfigurableTopology.start(new CrawlerTopology(), args); 25 | } 26 | 27 | public CrawlerTopology() { 28 | this(new DefaultServiceProvider()); 29 | } 30 | 31 | public CrawlerTopology(ServiceProvider serviceProvider) { 32 | this.serviceProvider = serviceProvider; 33 | } 34 | 35 | @Override 36 | protected int run(String[] strings) { 37 | TopologyBuilder builder = new TopologyBuilder(); 38 | 39 | builder.setSpout("generator", createUrlGeneratorSpout(serviceProvider)); 40 | 41 | builder.setBolt("partitioner", new URLPartitionerBolt()) 42 | .shuffleGrouping("generator"); 43 | 44 | builder.setBolt("fetch", new FetcherBolt()) 45 | .fieldsGrouping("partitioner", new Fields("key")); 46 | 47 | builder.setBolt("sitemap", new SiteMapParserBolt()) 48 | 
.localOrShuffleGrouping("fetch"); 49 | 50 | builder.setBolt("feed", new FeedParserBolt()) 51 | .localOrShuffleGrouping("sitemap"); 52 | 53 | builder.setBolt("links", createLinkExtractor(serviceProvider)) 54 | .localOrShuffleGrouping("feed"); 55 | 56 | builder.setBolt("index", createArticleIndexer(serviceProvider)) 57 | .localOrShuffleGrouping("fetch"); 58 | 59 | builder.setBolt("status", createStatusUpdater(serviceProvider)) 60 | .localOrShuffleGrouping("fetch", Constants.StatusStreamName) 61 | .localOrShuffleGrouping("sitemap", Constants.StatusStreamName) 62 | .localOrShuffleGrouping("index", Constants.StatusStreamName) 63 | .localOrShuffleGrouping("links", Constants.StatusStreamName); 64 | 65 | String topologyName = (String) conf.getOrDefault(Config.TOPOLOGY_NAME, "crawler"); 66 | System.setProperty("es.set.netty.runtime.available.processors", "false"); 67 | return submit(topologyName, conf, builder); 68 | } 69 | 70 | protected IRichSpout createUrlGeneratorSpout(ServiceProvider serviceProvider) { 71 | return new UrlGeneratorSpout(serviceProvider); 72 | } 73 | 74 | protected IRichBolt createLinkExtractor(ServiceProvider serviceProvider) { 75 | return new LinkExtractorBolt(serviceProvider); 76 | } 77 | 78 | protected IRichBolt createArticleIndexer(ServiceProvider serviceProvider) { 79 | return new ArticleIndexerBolt(serviceProvider); 80 | } 81 | 82 | protected IRichBolt createStatusUpdater(ServiceProvider serviceProvider) { 83 | return new StatusUpdaterBolt(serviceProvider); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import com.digitalpebble.stormcrawler.util.ConfUtils; 4 | import com.google.common.collect.Maps; 5 | import lt.tokenmill.crawling.es.*; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.io.Serializable; 10 | import java.util.Map; 11 | 12 | public class DefaultServiceProvider implements ServiceProvider, Serializable { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(DefaultServiceProvider.class); 15 | 16 | private static final Map ES_CONNECTIONS = Maps.newConcurrentMap(); 17 | 18 | public static ElasticConnection getElasticConnection(Map conf) { 19 | String hostname = ConfUtils.getString(conf, ElasticConstants.ES_HOSTNAME_PARAM); 20 | int restPort = ConfUtils.getInt(conf, ElasticConstants.ES_REST_PORT, 9200); 21 | String restScheme = ConfUtils.getString(conf, ElasticConstants.ES_REST_SCHEME, "http"); 22 | if (ES_CONNECTIONS.containsKey(hostname)) { 23 | return ES_CONNECTIONS.get(hostname); 24 | } else { 25 | ElasticConnection elasticConnection = ElasticConnection.getConnection(hostname, restPort, restScheme); 26 | ES_CONNECTIONS.put(hostname, elasticConnection); 27 | return ES_CONNECTIONS.get(hostname); 28 | } 29 | } 30 | 31 | public EsHttpUrlOperations createEsHttpUrlOperations(Map conf) { 32 | ElasticConnection connection = getElasticConnection(conf); 33 | String urlsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_URLS_INDEX_NAME_PARAM); 34 | String urlsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_URLS_DOC_TYPE_PARAM); 35 | return EsHttpUrlOperations.getInstance(connection, urlsIndexName, urlsDocumentType); 36 | } 37 | 38 | public EsHttpSourceOperations createEsHttpSourceOperations(Map conf) { 39 | ElasticConnection connection 
= getElasticConnection(conf); 40 | String sourcesIndexName = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM); 41 | String sourcesDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM); 42 | return EsHttpSourceOperations.getInstance(connection, sourcesIndexName, sourcesDocumentType); 43 | } 44 | 45 | public EsDocumentOperations creatEsDocumentOperations(Map conf) { 46 | ElasticConnection connection = getElasticConnection(conf); 47 | String docsIndexName = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_INDEX_NAME_PARAM); 48 | String docsDocumentType = ConfUtils.getString(conf, ElasticConstants.ES_DOCS_DOC_TYPE_PARAM); 49 | return EsDocumentOperations.getInstance(connection, docsIndexName, docsDocumentType); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler; 2 | 3 | import lt.tokenmill.crawling.es.EsDocumentOperations; 4 | import lt.tokenmill.crawling.es.EsHttpSourceOperations; 5 | import lt.tokenmill.crawling.es.EsHttpUrlOperations; 6 | 7 | import java.util.Map; 8 | 9 | /*** 10 | * Interface for external service factory. 11 | */ 12 | public interface ServiceProvider { 13 | 14 | EsHttpUrlOperations createEsHttpUrlOperations(Map conf); 15 | 16 | EsHttpSourceOperations createEsHttpSourceOperations(Map conf); 17 | 18 | EsDocumentOperations creatEsDocumentOperations(Map conf); 19 | } 20 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.bolt; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.persistence.AbstractStatusUpdaterBolt; 5 | import com.digitalpebble.stormcrawler.persistence.Status; 6 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 7 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider; 8 | import lt.tokenmill.crawling.crawler.ServiceProvider; 9 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache; 10 | import lt.tokenmill.crawling.data.HttpSource; 11 | import lt.tokenmill.crawling.es.*; 12 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 13 | import org.apache.storm.metric.api.MultiCountMetric; 14 | import org.apache.storm.task.OutputCollector; 15 | import org.apache.storm.task.TopologyContext; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.util.Date; 20 | import java.util.Map; 21 | 22 | public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt { 23 | 24 | private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class); 25 | 26 | private MultiCountMetric eventCounter; 27 | 28 | private EsHttpUrlOperations esUrlsOperations; 29 | private EsHttpSourceOperations esHttpSourcesOperations; 30 | private ServiceProvider serviceProvider; 31 | 32 | public StatusUpdaterBolt(ServiceProvider serviceProvider) { 33 | this.serviceProvider = serviceProvider; 34 | } 35 | 36 | @Override 37 | public void store(String url, Status status, Metadata metadata, Date nextFetch) throws Exception { 38 | try { 39 | String source = metadata.getFirstValue(CrawlerConstants.META_SOURCE); 40 | Boolean isSeed = 
Boolean.parseBoolean(metadata.getFirstValue(CrawlerConstants.META_IS_SEED)); 41 | HttpSource httpSource = EsHttpSourcesCache.get(esHttpSourcesOperations, source); 42 | UrlFilters filters = UrlFiltersCache.get(httpSource); 43 | 44 | String filtered = filters.filter(url); 45 | if (isSeed || (filtered == null && status.equals(Status.DISCOVERED))) { 46 | LOG.debug("Url '{}' is seed or rejected by filters", url); 47 | return; 48 | } 49 | 50 | String id = (filtered == null) ? url : filtered; 51 | 52 | LOG.debug("Setting '{}' status to '{}'", id, status); 53 | 54 | 55 | boolean create = status.equals(Status.DISCOVERED); 56 | String published = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED); 57 | if (published == null) { 58 | published = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED); 59 | } 60 | esUrlsOperations.upsertUrlStatus(id, published, source, create, status); 61 | 62 | if (status == Status.DISCOVERED) { 63 | eventCounter.scope("urls_discovered").incr(); 64 | } 65 | } catch (Exception e) { 66 | LOG.error("Failed to set status for url '{}'", url, e); 67 | } 68 | } 69 | 70 | 71 | @Override 72 | public void prepare(Map conf, TopologyContext context, OutputCollector outputCollector) { 73 | super.prepare(conf, context, outputCollector); 74 | this.eventCounter = context.registerMetric(this.getClass().getSimpleName(), new MultiCountMetric(), 10); 75 | this.esUrlsOperations = this.serviceProvider.createEsHttpUrlOperations(conf); 76 | this.esHttpSourcesOperations = this.serviceProvider.createEsHttpSourceOperations(conf); 77 | } 78 | 79 | @Override 80 | public void cleanup() { 81 | super.cleanup(); 82 | } 83 | } -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/HttpSourceConfiguration.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import com.google.common.collect.Iterables; 4 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 5 | import lt.tokenmill.crawling.crawler.utils.PrioritizedSource; 6 | import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache; 7 | import lt.tokenmill.crawling.data.HttpSource; 8 | import lt.tokenmill.crawling.es.EsHttpSourcesCache; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.Iterator; 13 | import java.util.List; 14 | import java.util.PriorityQueue; 15 | import java.util.stream.Collectors; 16 | 17 | import static java.lang.System.currentTimeMillis; 18 | 19 | public class HttpSourceConfiguration { 20 | 21 | private static final Logger LOG = LoggerFactory.getLogger(HttpSourceConfiguration.class); 22 | 23 | private final List sources; 24 | private final List sourceUrls; 25 | private final Iterator sourceCycle; 26 | private final PriorityQueue prioritizedSources; 27 | 28 | private static long lastReloadMillis = 0; 29 | 30 | private HttpSourceConfiguration(List sources) { 31 | this.sources = sources; 32 | this.sourceUrls = sources.stream() 33 | .map(HttpSource::getUrl) 34 | .collect(Collectors.toList()); 35 | LOG.info("Loaded {} active HTTP sources", this.sourceUrls.size()); 36 | this.sourceCycle = Iterables.cycle(this.sourceUrls).iterator(); 37 | this.prioritizedSources = 38 | new PriorityQueue<>(new PrioritizedSource.PrioritizedUrlComparator()); 39 | sources.forEach(s -> { 40 | s.getUrls().forEach(u -> prioritizedSources.offer(PrioritizedSource.createUrl(u, s))); 41 | s.getFeeds().forEach(u -> 
prioritizedSources.offer(PrioritizedSource.createFeed(u, s))); 42 | s.getSitemaps().forEach(u -> prioritizedSources.offer(PrioritizedSource.createSitemap(u, s))); 43 | }); 44 | } 45 | 46 | public PrioritizedSource prioritized() { 47 | PrioritizedSource prioritized = prioritizedSources.peek(); 48 | if (prioritized != null && 49 | (prioritized.getNextFetchTime() <= currentTimeMillis())) { 50 | prioritized = prioritizedSources.poll(); 51 | prioritized.recalculateNextFetchTime(); 52 | prioritizedSources.offer(prioritized); 53 | return prioritized; 54 | } 55 | return null; 56 | } 57 | 58 | public int maxTries() { 59 | return Math.min(10, sourceUrls.size()); 60 | } 61 | 62 | public boolean hasNextActive() { 63 | return sourceCycle.hasNext(); 64 | } 65 | 66 | 67 | public String nextActive() { 68 | return sourceCycle.next(); 69 | } 70 | 71 | public static HttpSourceConfiguration reload(HttpSourceConfiguration current, List sources) { 72 | HttpSourceConfiguration configuration; 73 | if (current != null && current.sources.equals(sources)) { 74 | LOG.info("HTTP source configuration didn't change. Using current version"); 75 | configuration = current; 76 | } else { 77 | configuration = new HttpSourceConfiguration(sources); 78 | EsHttpSourcesCache.invalidate(); 79 | UrlFiltersCache.invalidate(); 80 | } 81 | lastReloadMillis = currentTimeMillis(); 82 | return configuration; 83 | } 84 | 85 | public static boolean needsReload() { 86 | LOG.info("Checking reloading timeout. Remaining milliseconds: {}", 87 | lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY - currentTimeMillis()); 88 | return lastReloadMillis + CrawlerConstants.DEFAULT_SOURCE_RELOAD_DELAY < currentTimeMillis(); 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/PrioritizedSource.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | 6 | import java.io.Serializable; 7 | import java.util.Comparator; 8 | import java.util.concurrent.TimeUnit; 9 | 10 | import static lt.tokenmill.crawling.crawler.CrawlerConstants.*; 11 | 12 | public class PrioritizedSource implements Serializable { 13 | 14 | 15 | 16 | private final String url; 17 | 18 | private final HttpSource source; 19 | 20 | private Long delay = MIN_FETCH_DELAY; 21 | 22 | private boolean sitemap = false; 23 | private boolean feed = false; 24 | 25 | private Long nextFetchTime = System.currentTimeMillis(); 26 | 27 | private PrioritizedSource(String url, HttpSource source) { 28 | this.url = url; 29 | this.source = source; 30 | } 31 | 32 | private void setDelay(Long delay) { 33 | this.delay = Math.max(delay, MIN_FETCH_DELAY); 34 | } 35 | 36 | private void setSitemap(boolean sitemap) { 37 | this.sitemap = sitemap; 38 | } 39 | 40 | private void setFeed(boolean feed) { 41 | this.feed = feed; 42 | } 43 | 44 | public void recalculateNextFetchTime() { 45 | nextFetchTime = System.currentTimeMillis() + delay; 46 | } 47 | 48 | public String getUrl() { 49 | return url; 50 | } 51 | 52 | public boolean isSitemap() { 53 | return sitemap; 54 | } 55 | 56 | public boolean isFeed() { 57 | return feed; 58 | } 59 | 60 | public HttpSource getSource() { 61 | return source; 62 | } 63 | 64 | public long getNextFetchTime() { 65 | return nextFetchTime; 66 | } 67 | 68 | public static class PrioritizedUrlComparator implements Comparator, Serializable { 69 | 
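// Ascending order on nextFetchTime makes the PriorityQueue in
// HttpSourceConfiguration a min-heap: peek()/poll() always yield the
// URL, feed or sitemap entry that is due to be fetched soonest.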
70 | @Override 71 | public int compare(PrioritizedSource u1, PrioritizedSource u2) { 72 | return u1.nextFetchTime.compareTo(u2.nextFetchTime); 73 | } 74 | } 75 | 76 | public static PrioritizedSource createUrl(String url, HttpSource source) { 77 | PrioritizedSource result = new PrioritizedSource(url, source); 78 | long delay = source.getUrlRecrawlDelayInSecs() != null ? 79 | TimeUnit.SECONDS.toMillis(source.getUrlRecrawlDelayInSecs()) : DEFAULT_URL_FETCH_DELAY; 80 | result.setDelay(delay); 81 | return result; 82 | } 83 | 84 | public static PrioritizedSource createFeed(String url, HttpSource source) { 85 | PrioritizedSource result = new PrioritizedSource(url, source); 86 | long delay = source.getFeedRecrawlDelayInSecs() != null ? 87 | TimeUnit.SECONDS.toMillis(source.getFeedRecrawlDelayInSecs()) : DEFAULT_FEED_FETCH_DELAY; 88 | result.setDelay(delay); 89 | result.setFeed(true); 90 | return result; 91 | } 92 | 93 | public static PrioritizedSource createSitemap(String url, HttpSource source) { 94 | PrioritizedSource result = new PrioritizedSource(url, source); 95 | long delay = source.getSitemapRecrawlDelayInSecs() != null ? 96 | TimeUnit.SECONDS.toMillis(source.getSitemapRecrawlDelayInSecs()) : DEFAULT_SITEMAP_FETCH_DELAY; 97 | result.setDelay(delay); 98 | result.setSitemap(true); 99 | return result; 100 | } 101 | } -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFilterUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.filtering.URLFilters; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | import java.net.URL; 10 | import java.util.Map; 11 | 12 | public class UrlFilterUtils { 13 | 14 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtils.class); 15 | 16 | public static URLFilters load(Map conf, String filtersConfigFile) { 17 | if (filtersConfigFile != null) { 18 | try { 19 | URLFilters loaded = new URLFilters(conf, filtersConfigFile); 20 | LOG.info("Loaded URLFilters from '{}'", filtersConfigFile); 21 | return loaded; 22 | } catch (IOException e) { 23 | LOG.error("Exception caught while loading the URLFilters"); 24 | throw new RuntimeException("Exception caught while loading the URLFilters", e); 25 | } 26 | } else { 27 | return URLFilters.emptyURLFilters; 28 | } 29 | } 30 | 31 | public static String firstMatch(URL sourceUrl, Metadata metadata, String targetUrl, URLFilters...filters) { 32 | for (URLFilters filter : filters) { 33 | String filtered = filter.filter(sourceUrl, metadata, targetUrl); 34 | if (filtered != null) { 35 | return filtered; 36 | } 37 | } 38 | return null; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /crawler/src/main/java/lt/tokenmill/crawling/crawler/utils/UrlFiltersCache.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.utils; 2 | 3 | import com.google.common.cache.Cache; 4 | import com.google.common.cache.CacheBuilder; 5 | import lt.tokenmill.crawling.data.HttpSource; 6 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 7 | 8 | import java.util.concurrent.TimeUnit; 9 | 10 | public class UrlFiltersCache { 11 | 12 | private static final Cache CACHE; 13 | 14 | static { 15 | CACHE 
= CacheBuilder.newBuilder() 16 | .expireAfterWrite(10, TimeUnit.MINUTES) 17 | .build(); 18 | } 19 | 20 | public static UrlFilters get(HttpSource source) { 21 | UrlFilters filters = CACHE.getIfPresent(source.getUrl()); 22 | if (filters == null) { 23 | filters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters()); 24 | CACHE.put(source.getUrl(), filters); 25 | } 26 | return filters; 27 | } 28 | 29 | public static void invalidate() { 30 | CACHE.invalidateAll(); 31 | } 32 | } -------------------------------------------------------------------------------- /crawler/src/main/resources/urlfilters.json: -------------------------------------------------------------------------------- 1 | { 2 | "com.digitalpebble.stormcrawler.filtering.URLFilters": [ 3 | { 4 | "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter", 5 | "name": "RegexURLFilter", 6 | "params": { 7 | "regexFilterFile": "urlfilters.txt" 8 | } 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /crawler/src/main/resources/urlfilters.txt: -------------------------------------------------------------------------------- 1 | #Discard URLs, longer than 512 chars 2 | -.{512,} 3 | 4 | #Discard urls which are actually links to other urls 5 | -^https?://.*https?:.* 6 | 7 | #Discard urls containing illegal characters: space, %20 or # 8 | -.*(:?%20| |#|\@).* 9 | 10 | #Discard media or binary files 11 | -(?i).*\.(exe|dmg|csv|mp3|mp4|m4a|avi|mov|swf|wmv|dat|mpg|mpg4|flm|mtv|video|divx|mpeg4|film|xwmv|exo|pdf|jpg|jpeg|png|bmp|gif|doc|docx|xls|xlsx|ppt|pptx|rss)$ 12 | 13 | #Allow everything else 14 | +. -------------------------------------------------------------------------------- /crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlFilterUtilsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import com.digitalpebble.stormcrawler.Metadata; 4 | import com.digitalpebble.stormcrawler.filtering.URLFilters; 5 | import com.digitalpebble.stormcrawler.util.ConfUtils; 6 | import lt.tokenmill.crawling.crawler.CrawlerConstants; 7 | import lt.tokenmill.crawling.crawler.utils.UrlFilterUtils; 8 | import org.junit.Test; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.net.MalformedURLException; 13 | import java.net.URL; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | 17 | import static org.junit.Assert.assertNotNull; 18 | import static org.junit.Assert.assertNull; 19 | 20 | public class UrlFilterUtilsTest { 21 | 22 | private static final Logger LOG = LoggerFactory.getLogger(UrlFilterUtilsTest.class); 23 | private final String testSourceUrl = "http://www.tokenmill.lt/"; 24 | 25 | @Test 26 | public void testUrlFilters() { 27 | Map conf = new HashMap(); 28 | conf.put(CrawlerConstants.URL_FILTERS_FILE, "urlfilters.json"); 29 | String filtersConfigFile = ConfUtils.getString(conf, CrawlerConstants.URL_FILTERS_FILE); 30 | URLFilters filters = UrlFilterUtils.load(conf, filtersConfigFile); 31 | URL sourceUrl; 32 | try { 33 | sourceUrl = new URL(testSourceUrl); 34 | } catch (MalformedURLException e) { 35 | // we would have known by now as previous components check whether the URL is valid 36 | LOG.error("MalformedURLException on {}", testSourceUrl); 37 | return; 38 | } 39 | // test good URL 40 | assertNotNull(null, UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl, filters)); 41 | // test on bad URL 42 | 
assertNull(null, UrlFilterUtils.firstMatch(sourceUrl, new Metadata(), testSourceUrl.concat("song.mp3"), filters)); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /crawler/src/test/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpoutTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.crawler.spout; 2 | 3 | import lt.tokenmill.crawling.crawler.DefaultServiceProvider; 4 | import org.junit.Test; 5 | 6 | public class UrlGeneratorSpoutTest { 7 | 8 | 9 | @Test 10 | public void test() { 11 | UrlGeneratorSpout spout = new UrlGeneratorSpout(new DefaultServiceProvider()); 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /data-model/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <parent> 4 | <artifactId>crawling-framework</artifactId> 5 | <groupId>lt.tokenmill.crawling</groupId> 6 | <version>0.3.4-SNAPSHOT</version> 7 | </parent> 8 | <modelVersion>4.0.0</modelVersion> 9 | 10 | <artifactId>data-model</artifactId> 11 | 12 | <dependencies> 13 | <dependency> 14 | <groupId>joda-time</groupId> 15 | <artifactId>joda-time</artifactId> 16 | </dependency> 17 | <dependency> 18 | <groupId>com.google.guava</groupId> 19 | <artifactId>guava</artifactId> 20 | </dependency> 21 | <dependency> 22 | <groupId>junit</groupId> 23 | <artifactId>junit</artifactId> 24 | <version>4.13.1</version> 25 | <scope>test</scope> 26 | </dependency> 27 | </dependencies> 28 | 29 | <profiles> 30 | <profile> 31 | <id>release</id> 32 | <build> 33 | <plugins> 34 | <plugin> 35 | <groupId>org.apache.maven.plugins</groupId> 36 | <artifactId>maven-source-plugin</artifactId> 37 | </plugin> 38 | <plugin> 39 | <groupId>org.apache.maven.plugins</groupId> 40 | <artifactId>maven-jar-plugin</artifactId> 41 | </plugin> 42 | <plugin> 43 | <groupId>org.apache.maven.plugins</groupId> 44 | <artifactId>maven-javadoc-plugin</artifactId> 45 | </plugin> 46 | </plugins> 47 | </build> 48 | </profile> 49 | </profiles> 50 | </project> -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/DataUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import com.google.common.base.Splitter; 4 | import org.joda.time.DateTime; 5 | import org.joda.time.DateTimeZone; 6 | import org.joda.time.format.DateTimeFormat; 7 | import org.joda.time.format.DateTimeFormatter; 8 | 9 | import java.io.Serializable; 10 | import java.util.List; 11 | import java.util.stream.Collectors; 12 | 13 | public class DataUtils implements Serializable { 14 | 15 | private static final DateTimeFormatter FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss"); 16 | 17 | public static Integer tryParseInteger(Object object) { 18 | try { 19 | return (object != null) ? Integer.parseInt(object.toString()) : null; 20 | } catch (NumberFormatException e) { 21 | } 22 | return null; 23 | } 24 | 25 | public static Long tryParseLong(Object object) { 26 | try { 27 | return (object != null) ? Long.parseLong(object.toString()) : null; 28 | } catch (NumberFormatException e) { 29 | } 30 | return null; 31 | } 32 | 33 | public static List<String> parseStringList(Object object) { 34 | if (object == null) { 35 | return null; 36 | } 37 | return Splitter.onPattern("(?:\r?\n)+") 38 | .splitToList(object.toString()) 39 | .stream() 40 | .map(String::trim) 41 | .filter(s -> !s.isEmpty()) 42 | .collect(Collectors.toList()); 43 | } 44 | 45 | public static String formatInUTC(DateTime date) { 46 | return date != null ? FORMATTER.print(date.toDateTime(DateTimeZone.UTC)) : null; 47 | } 48 | 49 | public static DateTime parseFromUTC(String date) { 50 | return date != null ?
FORMATTER.parseDateTime(date) : null; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HighlightedSearchResult.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class HighlightedSearchResult implements Serializable { 7 | 8 | private HttpArticle article; 9 | 10 | private List highlights; 11 | 12 | public HighlightedSearchResult(HttpArticle article, List highlights) { 13 | this.article = article; 14 | this.highlights = highlights; 15 | } 16 | 17 | public HttpArticle getArticle() { 18 | return article; 19 | } 20 | 21 | public void setArticle(HttpArticle article) { 22 | this.article = article; 23 | } 24 | 25 | public List getHighlights() { 26 | return highlights; 27 | } 28 | 29 | public void setHighlights(List highlights) { 30 | this.highlights = highlights; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HtmlAnalysisResult.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | public class HtmlAnalysisResult implements Serializable { 8 | 9 | private String url; 10 | private String title; 11 | private Integer httpStatus; 12 | private List links; 13 | private Map headers; 14 | private List metaValues; 15 | 16 | private String robotsTxt; 17 | private Boolean robotsAllowedAll; 18 | private Boolean robotsAllowedNone; 19 | private Boolean robotsAllowedHome; 20 | private List robotsSitemaps; 21 | private Long robotsCrawlDelay; 22 | 23 | public String getUrl() { 24 | return url; 25 | } 26 | 27 | public void setUrl(String url) { 28 | this.url = url; 29 | } 30 | 31 | public String getTitle() { 32 | return title; 33 | } 34 | 35 | public void setTitle(String title) { 36 | this.title = title; 37 | } 38 | 39 | public List getLinks() { 40 | return links; 41 | } 42 | 43 | public void setLinks(List links) { 44 | this.links = links; 45 | } 46 | 47 | public List getMetaValues() { 48 | return metaValues; 49 | } 50 | 51 | public void setMetaValues(List metaValues) { 52 | this.metaValues = metaValues; 53 | } 54 | 55 | public String getRobotsTxt() { 56 | return robotsTxt; 57 | } 58 | 59 | public void setRobotsTxt(String robotsTxt) { 60 | this.robotsTxt = robotsTxt; 61 | } 62 | 63 | public Boolean getRobotsAllowedAll() { 64 | return robotsAllowedAll; 65 | } 66 | 67 | public void setRobotsAllowedAll(Boolean robotsAllowedAll) { 68 | this.robotsAllowedAll = robotsAllowedAll; 69 | } 70 | 71 | public Boolean getRobotsAllowedNone() { 72 | return robotsAllowedNone; 73 | } 74 | 75 | public void setRobotsAllowedNone(Boolean robotsAllowedNone) { 76 | this.robotsAllowedNone = robotsAllowedNone; 77 | } 78 | 79 | public Boolean getRobotsAllowedHome() { 80 | return robotsAllowedHome; 81 | } 82 | 83 | public void setRobotsAllowedHome(Boolean robotsAllowedHome) { 84 | this.robotsAllowedHome = robotsAllowedHome; 85 | } 86 | 87 | public List getRobotsSitemaps() { 88 | return robotsSitemaps; 89 | } 90 | 91 | public void setRobotsSitemaps(List robotsSitemaps) { 92 | this.robotsSitemaps = robotsSitemaps; 93 | } 94 | 95 | public Long getRobotsCrawlDelay() { 96 | return robotsCrawlDelay; 97 | } 98 | 99 | public 
void setRobotsCrawlDelay(Long robotsCrawlDelay) { 100 | this.robotsCrawlDelay = robotsCrawlDelay; 101 | } 102 | 103 | public Integer getHttpStatus() { 104 | return httpStatus; 105 | } 106 | 107 | public void setHttpStatus(Integer httpStatus) { 108 | this.httpStatus = httpStatus; 109 | } 110 | 111 | public Map getHeaders() { 112 | return headers; 113 | } 114 | 115 | public void setHeaders(Map headers) { 116 | this.headers = headers; 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticle.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | 4 | import org.joda.time.DateTime; 5 | 6 | import java.io.Serializable; 7 | import java.util.List; 8 | 9 | public class HttpArticle implements Serializable { 10 | 11 | private String source; 12 | 13 | private String language; 14 | 15 | private String url; 16 | 17 | private String title; 18 | 19 | private String text; 20 | 21 | private String textSignature; 22 | 23 | private List appIds; 24 | 25 | private DateTime published; 26 | 27 | private DateTime discovered; 28 | 29 | private List categories; 30 | 31 | public String getSource() { 32 | return source; 33 | } 34 | 35 | public String getUrl() { 36 | return url; 37 | } 38 | 39 | public void setSource(String source) { 40 | this.source = source; 41 | } 42 | 43 | public DateTime getPublished() { 44 | return published; 45 | } 46 | 47 | public void setPublished(DateTime published) { 48 | this.published = published; 49 | } 50 | 51 | public DateTime getDiscovered() { 52 | return discovered; 53 | } 54 | 55 | public void setDiscovered(DateTime discovered) { 56 | this.discovered = discovered; 57 | } 58 | 59 | public void setUrl(String url) { 60 | this.url = url; 61 | } 62 | 63 | public String getTitle() { 64 | return title; 65 | } 66 | 67 | public void setTitle(String title) { 68 | this.title = title; 69 | } 70 | 71 | public String getText() { 72 | return text; 73 | } 74 | 75 | public void setText(String text) { 76 | this.text = text; 77 | } 78 | 79 | public List getAppIds() { 80 | return appIds; 81 | } 82 | 83 | public void setAppIds(List appIds) { 84 | this.appIds = appIds; 85 | } 86 | 87 | public List getCategories() { 88 | return categories; 89 | } 90 | 91 | public void setCategories(List categories) { 92 | this.categories = categories; 93 | } 94 | 95 | public String getLanguage() { 96 | return language; 97 | } 98 | 99 | public void setLanguage(String language) { 100 | this.language = language; 101 | } 102 | 103 | public String getTextSignature() { 104 | return textSignature; 105 | } 106 | 107 | public void setTextSignature(String textSignature) { 108 | this.textSignature = textSignature; 109 | } 110 | 111 | @Override 112 | public String toString() { 113 | return "HttpArticle{" + 114 | "source='" + source + '\'' + 115 | ", language='" + language + '\'' + 116 | ", url='" + url + '\'' + 117 | ", title='" + title + '\'' + 118 | ", text='" + text + '\'' + 119 | ", textSignature='" + textSignature + '\'' + 120 | ", appIds=" + appIds + 121 | ", published=" + published + 122 | ", discovered=" + discovered + 123 | ", categories=" + categories + 124 | '}'; 125 | } 126 | } -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpArticleParseResult.java: -------------------------------------------------------------------------------- 1 | package 
lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class HttpArticleParseResult implements Serializable { 8 | 9 | private HttpArticle article; 10 | 11 | private List<String> titleMatches; 12 | 13 | private List<String> textMatches; 14 | 15 | private List<String> publishedTexts; 16 | 17 | private List<String> publishedMatches; 18 | 19 | private String publishedPattern; 20 | 21 | public HttpArticleParseResult() { 22 | } 23 | 24 | public HttpArticleParseResult(HttpArticle article) { 25 | this.article = article; 26 | } 27 | 28 | public HttpArticle getArticle() { 29 | return article; 30 | } 31 | 32 | public void setArticle(HttpArticle article) { 33 | this.article = article; 34 | } 35 | 36 | public List<String> getTitleMatches() { 37 | return titleMatches != null ? titleMatches : Collections.emptyList(); 38 | } 39 | 40 | public void setTitleMatches(List<String> titleMatches) { 41 | this.titleMatches = titleMatches; 42 | } 43 | 44 | public List<String> getTextMatches() { 45 | return textMatches != null ? textMatches : Collections.emptyList(); 46 | } 47 | 48 | public void setTextMatches(List<String> textMatches) { 49 | this.textMatches = textMatches; 50 | } 51 | 52 | public List<String> getPublishedTexts() { 53 | return publishedTexts != null ? publishedTexts : Collections.emptyList(); 54 | } 55 | 56 | public void setPublishedTexts(List<String> publishedTexts) { 57 | this.publishedTexts = publishedTexts; 58 | } 59 | 60 | public List<String> getPublishedMatches() { 61 | return publishedMatches != null ? publishedMatches : Collections.emptyList(); 62 | } 63 | 64 | public void setPublishedMatches(List<String> publishedMatches) { 65 | this.publishedMatches = publishedMatches; 66 | } 67 | 68 | public String getPublishedPattern() { 69 | return publishedPattern; 70 | } 71 | 72 | public void setPublishedPattern(String publishedPattern) { 73 | this.publishedPattern = publishedPattern; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpSourceTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | 5 | public class HttpSourceTest implements Serializable { 6 | 7 | private String source; 8 | 9 | private String url; 10 | 11 | private Boolean urlAccepted; 12 | 13 | private String html; 14 | 15 | private String title; 16 | 17 | private String text; 18 | 19 | private String date; 20 | 21 | public String getSource() { 22 | return source; 23 | } 24 | 25 | public void setSource(String source) { 26 | this.source = source; 27 | } 28 | 29 | public String getUrl() { 30 | return url; 31 | } 32 | 33 | public void setUrl(String url) { 34 | this.url = url; 35 | } 36 | 37 | public Boolean getUrlAccepted() { 38 | return urlAccepted; 39 | } 40 | 41 | public void setUrlAccepted(Boolean urlAccepted) { 42 | this.urlAccepted = urlAccepted; 43 | } 44 | 45 | public String getHtml() { 46 | return html; 47 | } 48 | 49 | public void setHtml(String html) { 50 | this.html = html; 51 | } 52 | 53 | public String getTitle() { 54 | return title; 55 | } 56 | 57 | public void setTitle(String title) { 58 | this.title = title; 59 | } 60 | 61 | public String getText() { 62 | return text; 63 | } 64 | 65 | public void setText(String text) { 66 | this.text = text; 67 | } 68 | 69 | public String getDate() { 70 | return date; 71 | } 72 | 73 | public void setDate(String date) { 74 | this.date = date; 75 | } 76 | } 77 |
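The data-model classes above are plain serializable beans; HttpSourceTest in particular is the fixture format consumed by HttpSourceTester in the parser module further below. The following sketch of that pairing is illustrative and not part of the repository: HttpSource's setUrl/setName calls appear in the Elasticsearch tests in this dump, but the remaining setters and the overall flow are assumptions.

// Illustrative sketch only: run an HttpSourceTest fixture against a source
// configuration. An empty result map means every expected field matched.
package lt.tokenmill.crawling.data;

import java.util.Map;

import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

public class HttpSourceTestExample {

    public static void main(String[] args) {
        HttpSource source = new HttpSource();
        source.setUrl("www.example.com");
        source.setName("Example");

        HttpSourceTest fixture = new HttpSourceTest();
        fixture.setSource("www.example.com");
        fixture.setUrl("http://www.example.com/news/1.html");
        fixture.setUrlAccepted(true);
        fixture.setHtml("<html><head><title>Title</title></head><body>Body text</body></html>");
        fixture.setTitle("Title");
        fixture.setText("Body text");

        // Each entry names a field whose actual value differed from the fixture.
        Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, fixture);
        differences.forEach((field, diff) -> System.out.println(field + " -> " + diff));
    }
}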
-------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/HttpUrl.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import org.joda.time.DateTime; 4 | 5 | import java.io.Serializable; 6 | 7 | public class HttpUrl implements Serializable { 8 | 9 | /** 10 | * Source url. 11 | */ 12 | private String source; 13 | 14 | private String url; 15 | 16 | /** 17 | * Publish date when it is known before parsing, e.g. from an RSS feed. 18 | */ 19 | private String published; 20 | 21 | /** 22 | * When this url was discovered. 23 | */ 24 | private DateTime discovered; 25 | 26 | public String getSource() { 27 | return source; 28 | } 29 | 30 | public void setSource(String source) { 31 | this.source = source; 32 | } 33 | 34 | public String getUrl() { 35 | return url; 36 | } 37 | 38 | public void setUrl(String url) { 39 | this.url = url; 40 | } 41 | 42 | public String getPublished() { 43 | return published; 44 | } 45 | 46 | public void setPublished(String published) { 47 | this.published = published; 48 | } 49 | 50 | public DateTime getDiscovered() { 51 | return discovered; 52 | } 53 | 54 | public void setDiscovered(DateTime discovered) { 55 | this.discovered = discovered; 56 | } 57 | } -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/NamedQuery.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable; 4 | 5 | public class NamedQuery implements Serializable { 6 | 7 | private String name; 8 | 9 | private String stemmedCaseSensitive; 10 | private String stemmedCaseInSensitive; 11 | private String notStemmedCaseSensitive; 12 | private String notStemmedCaseInSensitive; 13 | private String advanced; 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public String getStemmedCaseSensitive() { 24 | return stemmedCaseSensitive; 25 | } 26 | 27 | public void setStemmedCaseSensitive(String stemmedCaseSensitive) { 28 | this.stemmedCaseSensitive = stemmedCaseSensitive; 29 | } 30 | 31 | public String getStemmedCaseInSensitive() { 32 | return stemmedCaseInSensitive; 33 | } 34 | 35 | public void setStemmedCaseInSensitive(String stemmedCaseInSensitive) { 36 | this.stemmedCaseInSensitive = stemmedCaseInSensitive; 37 | } 38 | 39 | public String getNotStemmedCaseSensitive() { 40 | return notStemmedCaseSensitive; 41 | } 42 | 43 | public void setNotStemmedCaseSensitive(String notStemmedCaseSensitive) { 44 | this.notStemmedCaseSensitive = notStemmedCaseSensitive; 45 | } 46 | 47 | public String getNotStemmedCaseInSensitive() { 48 | return notStemmedCaseInSensitive; 49 | } 50 | 51 | public void setNotStemmedCaseInSensitive(String notStemmedCaseInSensitive) { 52 | this.notStemmedCaseInSensitive = notStemmedCaseInSensitive; 53 | } 54 | 55 | public String getAdvanced() { 56 | return advanced; 57 | } 58 | 59 | public void setAdvanced(String advanced) { 60 | this.advanced = advanced; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /data-model/src/main/java/lt/tokenmill/crawling/data/PageableList.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import java.io.Serializable;
4 | import java.util.List; 5 | 6 | public class PageableList<T> implements Serializable { 7 | 8 | private long totalCount; 9 | 10 | private List<T> items; 11 | 12 | public long getTotalCount() { 13 | return totalCount; 14 | } 15 | 16 | public void setTotalCount(long totalCount) { 17 | this.totalCount = totalCount; 18 | } 19 | 20 | public List<T> getItems() { 21 | return items; 22 | } 23 | 24 | public void setItems(List<T> items) { 25 | this.items = items; 26 | } 27 | 28 | public static <T> PageableList<T> create(List<T> items, long totalCount) { 29 | PageableList<T> pageableList = new PageableList<>(); 30 | pageableList.setItems(items); 31 | pageableList.setTotalCount(totalCount); 32 | return pageableList; 33 | } 34 | } -------------------------------------------------------------------------------- /data-model/src/test/java/lt/tokenmill/crawling/data/DataUtilsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.data; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.joda.time.DateTime; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class DataUtilsTest { 10 | 11 | @Test 12 | public void normalizerSplitter() { 13 | assertEquals(Lists.newArrayList("\\?.*$-->>", "a-->>b"), 14 | DataUtils.parseStringList("\\?.*$-->>\na-->>b\r\r\n\n")); 15 | } 16 | 17 | @Test 18 | public void dateFormatInUTC() { 19 | Long DATE_2017_01_04_12_26_00 = 1483532760805L; 20 | assertEquals("2017-01-04T12:26:00", DataUtils.formatInUTC(new DateTime(DATE_2017_01_04_12_26_00))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | elasticsearch: 5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest 6 | ports: ["9200:9200"] 7 | environment: 8 | discovery.type: single-node 9 | kibana: 10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0 11 | ports: ["5601:5601"] 12 | environment: 13 | SERVER_NAME: kibana 14 | ELASTICSEARCH_URL: http://elasticsearch:9200 15 | -------------------------------------------------------------------------------- /docker-compose.run.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | elasticsearch: 5 | image: registry.gitlab.com/tokenmill/crawling-framework/elasticsearch:latest 6 | ports: ["9200:9200"] 7 | environment: 8 | discovery.type: single-node 9 | kibana: 10 | image: docker.elastic.co/kibana/kibana-oss:6.3.0 11 | ports: ["5601:5601"] 12 | environment: 13 | SERVER_NAME: kibana 14 | ELASTICSEARCH_URL: http://elasticsearch:9200 15 | administration-ui: 16 | image: registry.gitlab.com/tokenmill/crawling-framework/ui:latest 17 | ports: ["8081:8081"] 18 | crawler: 19 | image: registry.gitlab.com/tokenmill/crawling-framework/crawler:latest 20 | environment: 21 | DEFAULT_SOURCE_RELOAD_DELAY: 10 22 | -------------------------------------------------------------------------------- /elasticsearch/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | elasticsearch 13 | 14 | 15 | 16 | 17 | lt.tokenmill.crawling 18 | data-model 19 | 20 | 21 | org.elasticsearch 22 | elasticsearch 23 | 24 | 25 | org.elasticsearch.client 26 | transport 27 | 28 | 29 | org.elasticsearch.client 30
| elasticsearch-rest-client 31 | ${elasticsearch.version} 32 | 33 | 34 | org.elasticsearch.client 35 | elasticsearch-rest-high-level-client 36 | ${elasticsearch.version} 37 | 38 | 39 | org.apache.httpcomponents 40 | httpasyncclient 41 | 4.1.3 42 | 43 | 44 | org.apache.httpcomponents 45 | httpcore-nio 46 | 4.4.6 47 | 48 | 49 | org.apache.httpcomponents 50 | httpclient 51 | 4.5.4 52 | 53 | 54 | org.apache.httpcomponents 55 | httpcore 56 | 4.4.6 57 | 58 | 59 | org.elasticsearch.plugin 60 | transport-netty4-client 61 | ${elasticsearch.version} 62 | test 63 | 64 | 65 | com.google.guava 66 | guava 67 | 68 | 69 | org.apache.logging.log4j 70 | log4j-api 71 | 2.7 72 | provided 73 | 74 | 75 | org.apache.logging.log4j 76 | log4j-core 77 | 2.13.2 78 | provided 79 | 80 | 81 | org.slf4j 82 | slf4j-log4j12 83 | ${slf4j.version} 84 | provided 85 | 86 | 87 | junit 88 | junit 89 | 4.13.1 90 | test 91 | 92 | 93 | 94 | 95 | 96 | release 97 | 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-source-plugin 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-jar-plugin 107 | 108 | 109 | 110 | org.apache.maven.plugins 111 | maven-javadoc-plugin 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.elasticsearch.client.RequestOptions; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.net.URLEncoder; 8 | import java.util.UUID; 9 | 10 | public class BaseElasticOps { 11 | 12 | protected final Logger LOG = LoggerFactory.getLogger(this.getClass()); 13 | 14 | private final RequestOptions requestOptions; 15 | private ElasticConnection connection; 16 | private String index; 17 | private String type; 18 | 19 | protected BaseElasticOps(ElasticConnection connection, String index, String type) { 20 | this.connection = connection; 21 | this.index = index; 22 | this.type = type; 23 | requestOptions = RequestOptions.DEFAULT; 24 | } 25 | 26 | protected ElasticConnection getConnection() { 27 | return connection; 28 | } 29 | 30 | protected String getIndex() { 31 | return index; 32 | } 33 | 34 | protected String getType() { 35 | return type; 36 | } 37 | 38 | protected RequestOptions getRequestOptions() { return requestOptions; } 39 | 40 | public void close() { 41 | if (connection != null) { 42 | connection.close(); 43 | } 44 | } 45 | 46 | protected static String formatId(String url) { 47 | try { 48 | String urlId = URLEncoder.encode(url.toLowerCase(), "utf-8"); 49 | if (urlId.length() > 511) { 50 | urlId = urlId.substring(0, 511); 51 | } 52 | return urlId; 53 | } catch (Exception e) { 54 | e.printStackTrace(); 55 | } 56 | return UUID.randomUUID().toString(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/ElasticConstants.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | public class ElasticConstants { 4 | 5 | public static final String ES_HOSTNAME_PARAM = "es.hostname"; 6 | public static final String ES_REST_PORT = "es.rest.port"; 7 | public static final String ES_REST_SCHEME = "es.rest.scheme"; 8 | 9 | public static final String ES_URLS_INDEX_NAME_PARAM = "es.urls.index.name"; 10 | public static final String 
ES_URLS_DOC_TYPE_PARAM = "es.urls.doc.type"; 11 | 12 | public static final String ES_DOCS_INDEX_NAME_PARAM = "es.docs.index.name"; 13 | public static final String ES_DOCS_DOC_TYPE_PARAM = "es.docs.doc.type"; 14 | 15 | public static final String ES_HTTP_SOURCES_INDEX_NAME_PARAM = "es.httpsource.index.name"; 16 | public static final String ES_HTTP_SOURCES_DOC_TYPE_PARAM = "es.httpsource.doc.type"; 17 | 18 | public static final String ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM = "es.httpsourcetest.index.name"; 19 | public static final String ES_HTTP_SOURCES_TEST_TYPE_PARAM = "es.httpsourcetest.doc.type"; 20 | 21 | public static final String ES_NAMED_QUERIES_INDEX_PARAM = "es.namedqueries.index.name"; 22 | public static final String ES_NAMED_QUERIES_TYPE_PARAM = "es.namedqueries.doc.type"; 23 | 24 | } 25 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDataParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.joda.time.DateTime; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.text.ParseException; 9 | import java.text.SimpleDateFormat; 10 | import java.util.List; 11 | import java.util.TimeZone; 12 | 13 | class EsDataParser { 14 | 15 | private static final Logger LOG = LoggerFactory.getLogger(EsDataParser.class); 16 | 17 | 18 | private static final List ES_DATE_TIME_FORMATS = Lists.newArrayList( 19 | "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", 20 | "yyyy-MM-dd'T'HH:mm:ss'Z'" 21 | ); 22 | 23 | static DateTime nullOrDate(Object object) { 24 | if (object != null) { 25 | DateTime result = null; 26 | for (String format : ES_DATE_TIME_FORMATS) { 27 | SimpleDateFormat formatter = new SimpleDateFormat(format); 28 | formatter.setTimeZone(TimeZone.getTimeZone("UTC")); 29 | try { 30 | result = new DateTime(formatter.parse(object.toString())); 31 | break; 32 | } catch (ParseException ignored) { 33 | } 34 | } 35 | if (result == null) { 36 | LOG.error("Failed to parse date from '{}'", object); 37 | } 38 | return result; 39 | } 40 | return null; 41 | } 42 | 43 | static boolean falseOrBoolean(Object object) { 44 | return (object != null) && Boolean.parseBoolean(object.toString()); 45 | } 46 | } -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourcesCache.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.cache.CacheBuilder; 4 | import com.google.common.cache.CacheLoader; 5 | import com.google.common.cache.LoadingCache; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | 8 | import java.util.concurrent.ExecutionException; 9 | import java.util.concurrent.TimeUnit; 10 | 11 | public class EsHttpSourcesCache { 12 | 13 | 14 | private static LoadingCache INSTANCE; 15 | 16 | private static synchronized LoadingCache getInstance( 17 | final EsHttpSourceOperations operations) { 18 | if (INSTANCE == null) { 19 | INSTANCE = CacheBuilder.newBuilder() 20 | .maximumSize(1000) 21 | .expireAfterWrite(10, TimeUnit.MINUTES) 22 | .build(new CacheLoader() { 23 | public HttpSource load(String url) { 24 | return operations.get(url); 25 | } 26 | }); 27 | } 28 | return INSTANCE; 29 | } 30 | 31 | public static HttpSource get(EsHttpSourceOperations operations, String source) { 32 | try { 33 | 
return getInstance(operations).get(source); 34 | } catch (ExecutionException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | 39 | public static void invalidate() { 40 | if (INSTANCE != null) { 41 | INSTANCE.invalidateAll(); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/Utils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.base.Joiner; 4 | import com.google.common.base.Splitter; 5 | import org.joda.time.DateTime; 6 | 7 | import java.util.Collection; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Objects; 11 | import java.util.stream.Collectors; 12 | 13 | import static com.google.common.base.Strings.isNullOrEmpty; 14 | 15 | public class Utils { 16 | 17 | private static final Splitter LINE_SPLITTER = Splitter.on('\n'); 18 | private static final Joiner LINE_JOINER = Joiner.on('\n'); 19 | 20 | public static List linesToList(String text) { 21 | return LINE_SPLITTER.splitToList(text).stream() 22 | .map(String::trim) 23 | .filter(l -> !isNullOrEmpty(l)) 24 | .collect(Collectors.toList()); 25 | } 26 | 27 | public static String listToText(List lines) { 28 | return lines != null ? LINE_JOINER.join(lines) : ""; 29 | } 30 | 31 | public static Object formatFieldValue(Object value) { 32 | if (value == null) { 33 | return null; 34 | } 35 | if (value instanceof List) { 36 | List v = (List) value; 37 | if (!v.isEmpty() && (v.get(0) instanceof Map)) { 38 | return v; 39 | } 40 | return listToText((List) value); 41 | } else if (value instanceof DateTime) { 42 | return ((DateTime) value).toDate(); 43 | } else if (value instanceof Enum) { 44 | return Objects.toString(value, null); 45 | } else { 46 | return value; 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/lt/tokenmill/crawling/es/model/DateHistogramValue.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es.model; 2 | 3 | public class DateHistogramValue { 4 | 5 | private Long value; 6 | 7 | private String date; 8 | 9 | public DateHistogramValue(String date, Long value) { 10 | this.value = value; 11 | this.date = date; 12 | } 13 | 14 | public Long getValue() { 15 | return value; 16 | } 17 | 18 | public void setValue(Long value) { 19 | this.value = value; 20 | } 21 | 22 | public String getDate() { 23 | return date; 24 | } 25 | 26 | public void setDate(String date) { 27 | this.date = date; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | }, 8 | "analysis": { 9 | "filter": { 10 | "english_stop": { 11 | "type": "stop", 12 | "stopwords": "_english_" 13 | }, 14 | "english_light_stemmer": { 15 | "type": "stemmer", 16 | "language": "light_english" 17 | }, 18 | "english_possessive_stemmer": { 19 | "type": "stemmer", 20 | "language": "possessive_english" 21 | } 22 | }, 23 | "analyzer": { 24 | "english_stem_cs": { 25 | "tokenizer": "standard", 26 | "filter": [ 27 | "english_possessive_stemmer", 28 | "english_stop", 29 | 
"english_light_stemmer" 30 | ] 31 | }, 32 | "english_stem_ci": { 33 | "tokenizer": "standard", 34 | "filter": [ 35 | "english_possessive_stemmer", 36 | "lowercase", 37 | "english_stop", 38 | "english_light_stemmer" 39 | ] 40 | }, 41 | "english_nostem_cs": { 42 | "tokenizer": "standard", 43 | "filter": [ 44 | "english_possessive_stemmer", 45 | "english_stop" 46 | ] 47 | }, 48 | "english_nostem_ci": { 49 | "tokenizer": "standard", 50 | "filter": [ 51 | "english_possessive_stemmer", 52 | "lowercase", 53 | "english_stop" 54 | ] 55 | } 56 | } 57 | } 58 | }, 59 | "mappings": { 60 | "doc": { 61 | "_source": { 62 | "enabled": true 63 | }, 64 | "properties": { 65 | "created": { 66 | "type": "date" 67 | }, 68 | "published": { 69 | "type": "date" 70 | }, 71 | "discovered": { 72 | "type": "date" 73 | }, 74 | "updated": { 75 | "type": "date" 76 | }, 77 | "url": { 78 | "type": "keyword" 79 | }, 80 | "source": { 81 | "type": "keyword" 82 | }, 83 | "language": { 84 | "type": "keyword" 85 | }, 86 | "status": { 87 | "type": "keyword" 88 | }, 89 | "app_ids": { 90 | "type": "keyword" 91 | }, 92 | "categories": { 93 | "type": "keyword" 94 | }, 95 | "title": { 96 | "type": "text", 97 | "index": true, 98 | "doc_values": false, 99 | "fielddata": true, 100 | "fields": { 101 | "stem_cs": { 102 | "type": "text", 103 | "index": true, 104 | "analyzer": "english_stem_cs" 105 | }, 106 | "stem_ci": { 107 | "type": "text", 108 | "index": true, 109 | "analyzer": "english_stem_ci" 110 | }, 111 | "nostem_cs": { 112 | "type": "text", 113 | "index": true, 114 | "analyzer": "english_nostem_cs" 115 | }, 116 | "nostem_ci": { 117 | "type": "text", 118 | "index": true, 119 | "analyzer": "english_nostem_ci" 120 | } 121 | } 122 | }, 123 | "text": { 124 | "type": "text", 125 | "doc_values": false, 126 | "fielddata": true, 127 | "fields": { 128 | "stem_cs": { 129 | "type": "text", 130 | "index": true, 131 | "analyzer": "english_stem_cs" 132 | }, 133 | "stem_ci": { 134 | "type": "text", 135 | "index": true, 136 | "analyzer": "english_stem_ci" 137 | }, 138 | "nostem_cs": { 139 | "type": "text", 140 | "index": true, 141 | "analyzer": "english_nostem_cs" 142 | }, 143 | "nostem_ci": { 144 | "type": "text", 145 | "index": true, 146 | "analyzer": "english_nostem_ci" 147 | } 148 | } 149 | }, 150 | "text_signature": { 151 | "type": "keyword" 152 | }, 153 | "duplicate_of": { 154 | "type": "keyword" 155 | } 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/http_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "http_source": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "created": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "updated": { 20 | "type": "date", 21 | "format": "date_optional_time" 22 | }, 23 | "url": { 24 | "type": "keyword", 25 | "copy_to": "search_field" 26 | }, 27 | "name": { 28 | "type": "keyword", 29 | "copy_to": "search_field" 30 | }, 31 | "timezone": { 32 | "type": "keyword" 33 | }, 34 | "language": { 35 | "type": "keyword" 36 | }, 37 | "url_crawl_delay_secs": { 38 | "type": "integer" 39 | }, 40 | "feed_crawl_delay_secs": { 41 | "type": "integer" 42 | }, 43 | "sitemap_crawl_delay_secs": { 44 | "type": "integer" 45 | }, 46 | "enabled": { 47 | "type": "boolean" 
48 | }, 49 | "discovery_enabled": { 50 | "type": "boolean" 51 | }, 52 | "urls": { 53 | "type": "keyword", 54 | "copy_to": "search_field" 55 | }, 56 | "sitemaps": { 57 | "type": "keyword", 58 | "copy_to": "search_field" 59 | }, 60 | "feeds": { 61 | "type": "keyword", 62 | "copy_to": "search_field" 63 | }, 64 | "countries": { 65 | "type": "keyword" 66 | }, 67 | "categories": { 68 | "type": "keyword" 69 | }, 70 | "app_ids": { 71 | "type": "keyword" 72 | }, 73 | "url_filters": { 74 | "type": "keyword" 75 | }, 76 | "url_normalizers": { 77 | "type": "keyword" 78 | }, 79 | "title_selectors": { 80 | "type": "keyword" 81 | }, 82 | "date_selectors": { 83 | "type": "keyword" 84 | }, 85 | "text_selectors": { 86 | "type": "keyword" 87 | }, 88 | "text_normalizers": { 89 | "type": "keyword" 90 | }, 91 | "date_regexps": { 92 | "type": "keyword" 93 | }, 94 | "date_formats": { 95 | "type": "keyword" 96 | }, 97 | "search_field": { 98 | "type": "text" 99 | } 100 | } 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/http_source_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "http_source_test": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "updated": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "source_url": { 20 | "type": "keyword", 21 | "copy_to": "search_field" 22 | }, 23 | "url": { 24 | "type": "keyword", 25 | "copy_to": "search_field" 26 | }, 27 | "url_accepted": { 28 | "type": "boolean", 29 | "doc_values": false 30 | }, 31 | "html": { 32 | "type": "keyword", 33 | "index": false, 34 | "doc_values": false 35 | }, 36 | "title": { 37 | "type": "keyword", 38 | "index": false, 39 | "doc_values": false 40 | }, 41 | "text": { 42 | "type": "keyword", 43 | "index": false, 44 | "doc_values": false 45 | }, 46 | "date": { 47 | "type": "keyword", 48 | "index": false, 49 | "doc_values": false 50 | }, 51 | "search_field": { 52 | "type": "text" 53 | } 54 | } 55 | } 56 | } 57 | } -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "named_query": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "updated": { 16 | "type": "date", 17 | "format": "date_optional_time" 18 | }, 19 | "name": { 20 | "type": "keyword" 21 | }, 22 | "name_suggest": { 23 | "type": "completion" 24 | }, 25 | "stemmed_case_sensitive": { 26 | "type": "keyword" 27 | }, 28 | "stemmed_case_insensitive": { 29 | "type": "keyword" 30 | }, 31 | "not_stemmed_case_sensitive": { 32 | "type": "keyword" 33 | }, 34 | "not_stemmed_case_insensitive": { 35 | "type": "keyword" 36 | }, 37 | "advanced": { 38 | "type": "keyword" 39 | } 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/indices/url.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 1, 4 | "number_of_replicas": 0, 5 | "index": { 
6 | "codec": "best_compression" 7 | } 8 | }, 9 | "mappings": { 10 | "url": { 11 | "_source": { 12 | "enabled": true 13 | }, 14 | "properties": { 15 | "created": { 16 | "type": "date" 17 | }, 18 | "updated": { 19 | "type": "date" 20 | }, 21 | "published": { 22 | "type": "date" 23 | }, 24 | "url": { 25 | "type": "keyword" 26 | }, 27 | "source": { 28 | "type": "keyword" 29 | }, 30 | "status": { 31 | "type": "keyword" 32 | } 33 | } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticConnectionTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.apache.http.HttpHost; 4 | import org.elasticsearch.action.DocWriteRequest; 5 | import org.elasticsearch.action.bulk.BulkItemResponse; 6 | import org.elasticsearch.action.bulk.BulkProcessor; 7 | import org.elasticsearch.action.bulk.BulkRequest; 8 | import org.elasticsearch.action.bulk.BulkResponse; 9 | import org.elasticsearch.action.index.IndexRequest; 10 | import org.elasticsearch.action.update.UpdateRequest; 11 | import org.elasticsearch.client.RestClient; 12 | import org.elasticsearch.client.RestClientBuilder; 13 | import org.elasticsearch.client.RestHighLevelClient; 14 | import org.elasticsearch.common.unit.TimeValue; 15 | import org.junit.Test; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.io.UnsupportedEncodingException; 20 | import java.net.URLDecoder; 21 | 22 | import static org.junit.Assert.assertNotNull; 23 | 24 | public class ElasticConnectionTest { 25 | private static final Logger LOG = LoggerFactory.getLogger(ElasticConnectionTest.class); 26 | @Test 27 | public void testConnectionBuilder() { 28 | ElasticConnection connection = ElasticConnection.builder().build(); 29 | assertNotNull(connection.getRestHighLevelClient()); 30 | } 31 | 32 | @Test 33 | public void testBuilder() { 34 | BulkProcessor.Listener listener = new BulkProcessor.Listener() { 35 | 36 | @Override 37 | public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { 38 | for (BulkItemResponse item : response.getItems()) { 39 | if (item.isFailed()) { 40 | LOG.error("Bulk item failure: '{}' for request '{}'", 41 | item.getFailure(), request.requests().get(item.getItemId())); 42 | } 43 | } 44 | } 45 | 46 | @Override 47 | public void afterBulk(long executionId, BulkRequest request, Throwable response) { 48 | LOG.error("Bulk failed:" + response); 49 | } 50 | 51 | @Override 52 | public void beforeBulk(long executionId, BulkRequest request) { 53 | for (DocWriteRequest r :request.requests()) { 54 | try { 55 | if (r instanceof IndexRequest) { 56 | IndexRequest indexRequest = (IndexRequest) r; 57 | indexRequest.id(URLDecoder.decode(indexRequest.id(), "utf-8")); 58 | 59 | } else if (r instanceof UpdateRequest) { 60 | UpdateRequest updateRequest = (UpdateRequest) r; 61 | updateRequest.id(URLDecoder.decode(updateRequest.id(), "utf-8")); 62 | } 63 | } catch (UnsupportedEncodingException e) { 64 | e.printStackTrace(); 65 | } 66 | } 67 | } 68 | }; 69 | ElasticConnection connection = ElasticConnection.builder() 70 | .hostname("0.0.0.0") 71 | .restPort(443) 72 | .restScheme("https") 73 | .bulkActions(1) 74 | .flushIntervalString("1s") 75 | .listener(listener) 76 | .build(); 77 | assertNotNull(connection); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- 
/elasticsearch/src/test/java/lt/tokenmill/crawling/es/ElasticsearchTestServer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import org.elasticsearch.client.Client; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.node.Node; 7 | import org.elasticsearch.node.NodeValidationException; 8 | import org.elasticsearch.plugins.Plugin; 9 | import org.elasticsearch.transport.Netty4Plugin; 10 | 11 | import java.io.File; 12 | import java.io.IOException; 13 | import java.nio.file.FileVisitOption; 14 | import java.nio.file.Files; 15 | import java.nio.file.Path; 16 | import java.nio.file.Paths; 17 | import java.util.Arrays; 18 | import java.util.Collection; 19 | import java.util.Comparator; 20 | 21 | public class ElasticsearchTestServer { 22 | 23 | private static class MyNode extends Node { 24 | MyNode(Settings preparedSettings, Collection> classpathPlugins) { 25 | super(new Environment(preparedSettings, null), classpathPlugins, false); 26 | } 27 | } 28 | 29 | private final Node node; 30 | private Client client; 31 | 32 | private ElasticsearchTestServer(Builder builder) { 33 | if (builder.cleanDataDir) { 34 | try { 35 | Path rootPath = Paths.get(builder.dataDirectory); 36 | if (Files.exists(rootPath)) { 37 | Files.walk(rootPath, FileVisitOption.FOLLOW_LINKS) 38 | .sorted(Comparator.reverseOrder()) 39 | .map(Path::toFile) 40 | .forEach(File::delete); 41 | } 42 | } catch (IOException e) { 43 | e.printStackTrace(); 44 | } 45 | } 46 | Settings settings = Settings.builder() 47 | .put("client.transport.ignore_cluster_name", true) 48 | .put("transport.type", "netty4") 49 | .put("http.type", "netty4") 50 | .put("http.enabled", "true") 51 | .put("http.port", builder.httpPort) 52 | .put("path.home", builder.dataDirectory) 53 | .put("transport.tcp.port", builder.transportPort) 54 | .build(); 55 | this.node = new MyNode(settings, Arrays.asList(Netty4Plugin.class)); 56 | } 57 | 58 | public void start() { 59 | try { 60 | this.node.start(); 61 | this.client = this.node.client(); 62 | } catch (NodeValidationException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | public void stop() { 68 | try { 69 | this.client.close(); 70 | this.node.close(); 71 | } catch (IOException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | 76 | public static Builder builder() { 77 | return new Builder(); 78 | } 79 | 80 | 81 | public static class Builder { 82 | 83 | private boolean cleanDataDir = true; 84 | private String dataDirectory = "target/elasticsearch-data"; 85 | private int httpPort = 9200; 86 | private int transportPort = 9305; 87 | 88 | public Builder httpPort(int httpPort) { 89 | this.httpPort = httpPort; 90 | return this; 91 | } 92 | 93 | public Builder transportPort(int transportPort) { 94 | this.transportPort = transportPort; 95 | return this; 96 | } 97 | 98 | public ElasticsearchTestServer build() { 99 | return new ElasticsearchTestServer(this); 100 | } 101 | 102 | 103 | public Builder dataDirectory(String dataDirectory) { 104 | this.dataDirectory = dataDirectory; 105 | return this; 106 | } 107 | 108 | public Builder cleanDataDir(boolean cleanDataDir) { 109 | this.cleanDataDir = cleanDataDir; 110 | return this; 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsDocumentOperationsTest.java: 
-------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import org.joda.time.DateTime; 6 | import org.junit.Ignore; 7 | import org.junit.Test; 8 | 9 | import java.util.Arrays; 10 | import java.util.Map; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | import static org.junit.Assert.assertNull; 14 | 15 | public class EsDocumentOperationsTest { 16 | 17 | @Test 18 | @Ignore 19 | public void test() throws InterruptedException { 20 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 21 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "demo-docs", "doc"); 22 | HttpArticle article = new HttpArticle(); 23 | article.setUrl("http://www.bbc.com/news/science-environment-43727547"); 24 | article.setTitle("title"); 25 | article.setText("text"); 26 | article.setPublished(DateTime.now()); 27 | 28 | esDocumentOperations.store(article); 29 | 30 | Thread.sleep(6000); 31 | 32 | HttpArticle httpArticle = esDocumentOperations.get(article.getUrl()); 33 | assertEquals(article.getUrl(), httpArticle.getUrl()); 34 | assertEquals(article.getText(), httpArticle.getText()); 35 | 36 | esDocumentOperations.update(article, ImmutableMap.of("TESTKEY", Arrays.asList(ImmutableMap.of("k1", "v1")))); 37 | Thread.sleep(6000); 38 | Map articleMap = esDocumentOperations.getAsMap(article.getUrl()); 39 | assertEquals(article.getText(), articleMap.get("text")); 40 | assertEquals("TESTVAL", articleMap.get("TESTKEY")); 41 | } 42 | 43 | @Test 44 | @Ignore 45 | public void testDuplicateFinder() throws InterruptedException { 46 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 47 | EsDocumentOperations esDocumentOperations = EsDocumentOperations.getInstance(connection, "cf-docs", "doc"); 48 | HttpArticle article = new HttpArticle(); 49 | article.setUrl("url1"); 50 | article.setSource("source"); 51 | article.setTitle("title"); 52 | article.setText("text"); 53 | article.setTextSignature("text_signature"); 54 | article.setPublished(DateTime.now()); 55 | esDocumentOperations.store(article); 56 | Thread.sleep(6000); 57 | HttpArticle duplicate = esDocumentOperations.findDuplicate(article); 58 | assertNull(duplicate); 59 | article.setUrl("url2"); 60 | esDocumentOperations.store(article); 61 | Thread.sleep(6000); 62 | assertEquals("url1", esDocumentOperations.getAsMap("url2").get("duplicate_of")); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpSourceOperationsTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import lt.tokenmill.crawling.data.HttpSource; 4 | import lt.tokenmill.crawling.data.PageableList; 5 | import org.junit.Ignore; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.*; 9 | 10 | public class EsHttpSourceOperationsTest { 11 | 12 | @Test 13 | @Ignore 14 | public void test() { 15 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 16 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "demo-http_sources", "http_source"); 17 | PageableList data = esHttpSourceOperations.filter(null); 18 | for (HttpSource source : data.getItems()) { 19 | 
System.out.println(">>" + source); 20 | } 21 | } 22 | 23 | @Ignore 24 | @Test 25 | public void testRefresh() { 26 | ElasticConnection connection = ElasticConnection.getConnection("localhost", 9200, "http"); 27 | EsHttpSourceOperations esHttpSourceOperations = new EsHttpSourceOperations(connection, "cf-http_sources", "http_source"); 28 | HttpSource source = new HttpSource(); 29 | source.setName("test"); 30 | source.setUrl("url"); 31 | esHttpSourceOperations.save(source); 32 | String currentName = esHttpSourceOperations.get("url").getName(); 33 | assertEquals("test", currentName); 34 | source.setName("new name"); 35 | esHttpSourceOperations.save(source); 36 | String name = esHttpSourceOperations.get("url").getName(); 37 | assertNotEquals("test", name); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/EsHttpUrlOperationsTestInt.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import lt.tokenmill.crawling.data.HttpUrl; 4 | import org.junit.Test; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | import java.util.List; 10 | 11 | import static junit.framework.TestCase.assertTrue; 12 | 13 | public class EsHttpUrlOperationsTestInt { 14 | 15 | private static final Logger LOG = LoggerFactory.getLogger(EsHttpUrlOperationsTestInt.class); 16 | 17 | private static final String ES_TEST_HOST = "elasticsearch"; 18 | private static final int ES_HTTP_TEST_PORT = 9200; 19 | private static final String ES_REST_TEST_SCHEME = "http"; 20 | private static final String INDEX_ALIAS = "urls"; 21 | private static final String DOC_TYPE = "url"; 22 | 23 | 24 | @Test 25 | public void testEsHttpSourceOperations000() throws IOException, InterruptedException { 26 | ElasticConnection connection = ElasticConnection.getConnection(ES_TEST_HOST, ES_HTTP_TEST_PORT, ES_REST_TEST_SCHEME); 27 | EsHttpUrlOperations esHttpUrlOperations = EsHttpUrlOperations.getInstance(connection, INDEX_ALIAS, DOC_TYPE); 28 | 29 | String url = "http://www.bbc.com/news/science-environment-43727547"; 30 | String source = "www.bbc.com"; 31 | esHttpUrlOperations.upsertUrlStatus(url, null, source, true, "a"); 32 | Thread.sleep(6000); 33 | esHttpUrlOperations.upsertUrlStatus(url, null, source, false, "b"); 34 | Thread.sleep(6000); 35 | List urls = esHttpUrlOperations.findUrlsByStatusAndSource("b", source, 10); 36 | assertTrue(urls.size() > 0); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /elasticsearch/src/test/java/lt/tokenmill/crawling/es/TestUtils.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.es; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.nio.charset.StandardCharsets; 6 | import java.nio.file.Files; 7 | import java.nio.file.Paths; 8 | 9 | public class TestUtils { 10 | 11 | public static byte[] readResourceAsBytes(String filename) throws URISyntaxException, IOException { 12 | return Files.readAllBytes(Paths.get(TestUtils.class.getClassLoader().getResource(filename).toURI())); 13 | } 14 | 15 | public static String readResourceAsString(String filename) throws URISyntaxException, IOException { 16 | return new String(readResourceAsBytes(filename), StandardCharsets.UTF_8); 17 | } 18 | 19 | } 20 | 
-------------------------------------------------------------------------------- /elasticsearch/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Target=System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{5}:%L - %m%n 7 | -------------------------------------------------------------------------------- /elasticsearch/src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | name=PropertiesConfig 2 | property.filename = logs 3 | appenders = console 4 | appender.console.type = Console 5 | appender.console.name = STDOUT 6 | appender.console.layout.type = PatternLayout 7 | appender.console.layout.pattern = [%-5level] %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %c{5} - %msg%n 8 | 9 | rootLogger.level = WARN 10 | rootLogger.appenderRefs = stdout 11 | rootLogger.appenderRef.stdout.ref = STDOUT 12 | 13 | appender.org.elasticsearch = debug -------------------------------------------------------------------------------- /page-analyzer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | page-analyzer 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | data-model 18 | ${project.version} 19 | 20 | 21 | org.jsoup 22 | jsoup 23 | 24 | 25 | com.google.guava 26 | guava 27 | 28 | 29 | com.mashape.unirest 30 | unirest-java 31 | 1.4.9 32 | 33 | 34 | com.github.crawler-commons 35 | crawler-commons 36 | 0.7 37 | 38 | 39 | org.slf4j 40 | slf4j-log4j12 41 | ${slf4j.version} 42 | provided 43 | 44 | 45 | junit 46 | junit 47 | 4.13.1 48 | test 49 | 50 | 51 | 52 | 53 | 54 | release 55 | 56 | 57 | 58 | org.apache.maven.plugins 59 | maven-source-plugin 60 | 61 | 62 | 63 | org.apache.maven.plugins 64 | maven-jar-plugin 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-javadoc-plugin 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /page-analyzer/src/main/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.pageanalyzer; 2 | 3 | import com.google.common.base.Joiner; 4 | import com.google.common.collect.Maps; 5 | import com.mashape.unirest.http.HttpResponse; 6 | import com.mashape.unirest.http.Unirest; 7 | import com.mashape.unirest.http.exceptions.UnirestException; 8 | import crawlercommons.robots.BaseRobotRules; 9 | import crawlercommons.robots.SimpleRobotRulesParser; 10 | import lt.tokenmill.crawling.data.HtmlAnalysisResult; 11 | import org.jsoup.Jsoup; 12 | import org.jsoup.nodes.Document; 13 | import org.jsoup.nodes.Element; 14 | 15 | import java.net.URL; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.stream.Collectors; 19 | 20 | public class PageAnalyzer { 21 | 22 | private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"; 23 | 24 | public static final String CONFIG_USER_AGENT = "UserAgent"; 25 | public static final String CONFIG_ANALYZE_ROBOTS_TXT = "RobotsTxt"; 26 | 27 | public static HtmlAnalysisResult analyze(Map 
config, String url) { 28 | try { 29 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT); 30 | HttpResponse response = Unirest.get(url) 31 | .header("User-Agent", userAgent) 32 | .asString(); 33 | return analyze(config, url, response.getBody(), response.getStatus(), response.getHeaders()); 34 | } catch (UnirestException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | 39 | public static HtmlAnalysisResult analyze(Map config, String url, String html) { 40 | return analyze(config, url, html, null, Maps.newHashMap()); 41 | } 42 | 43 | public static HtmlAnalysisResult analyze(Map config, String url, String html, Integer status, Map> headers) { 44 | try { 45 | HtmlAnalysisResult result = new HtmlAnalysisResult(); 46 | result.setUrl(url); 47 | result.setHttpStatus(status); 48 | result.setHeaders(headers.entrySet() 49 | .stream() 50 | .collect(Collectors.toMap(Map.Entry::getKey, e -> Joiner.on("\n").join(e.getValue())))); 51 | 52 | Document document = Jsoup.parse(html, url); 53 | result.setTitle(document.title()); 54 | 55 | List meta = document.select("meta").stream().map(Element::toString).collect(Collectors.toList()); 56 | result.setMetaValues(meta); 57 | 58 | List links = document.select("a").stream().map(e -> e.attr("abs:href")).collect(Collectors.toList()); 59 | result.setLinks(links); 60 | 61 | if (Boolean.parseBoolean(config.get(CONFIG_ANALYZE_ROBOTS_TXT))) { 62 | String robotsUrl = robotsTxtUrl(url); 63 | String userAgent = config.getOrDefault(CONFIG_USER_AGENT, DEFAULT_USER_AGENT); 64 | HttpResponse response = Unirest.get(robotsUrl) 65 | .header("User-Agent", userAgent) 66 | .asString(); 67 | String robotsTxt = response.getBody(); 68 | parseRobotsTxt(userAgent, robotsUrl, robotsTxt, result); 69 | } 70 | return result; 71 | } catch (Exception e) { 72 | throw new RuntimeException(e); 73 | } 74 | } 75 | 76 | public static void parseRobotsTxt(String userAgent, String robotsUrl, String robotsTxt, HtmlAnalysisResult result) { 77 | result.setRobotsTxt(robotsTxt); 78 | SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); 79 | BaseRobotRules robotRules = robotsParser.parseContent(robotsUrl, robotsTxt.getBytes(), null, userAgent); 80 | result.setRobotsAllowedAll(robotRules.isAllowAll()); 81 | result.setRobotsAllowedNone(robotRules.isAllowNone()); 82 | result.setRobotsAllowedHome(robotRules.isAllowed("/")); 83 | result.setRobotsSitemaps(robotRules.getSitemaps()); 84 | result.setRobotsCrawlDelay(robotRules.getCrawlDelay()); 85 | } 86 | 87 | private static String robotsTxtUrl(String url) { 88 | try { 89 | URL urlObject = new URL(url); 90 | String portPart = urlObject.getPort() > 0 ? 
":" + urlObject.getPort() : ""; 91 | return String.format("%s://%s%s/robots.txt", urlObject.getProtocol(), 92 | urlObject.getHost(), portPart); 93 | } catch (Exception e) { 94 | throw new RuntimeException(e); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /page-analyzer/src/test/java/lt/tokenmill/crawling/pageanalyzer/PageAnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.pageanalyzer; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.collect.Lists; 5 | import com.google.common.collect.Maps; 6 | import com.google.common.io.Resources; 7 | import lt.tokenmill.crawling.data.HtmlAnalysisResult; 8 | import org.junit.Ignore; 9 | import org.junit.Test; 10 | 11 | import java.net.URL; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.assertFalse; 18 | import static org.junit.Assert.assertTrue; 19 | 20 | public class PageAnalyzerTest { 21 | 22 | @Test 23 | public void headersAndStatus() { 24 | Map> headers = Maps.newHashMap(); 25 | headers.put("Etag", Lists.newArrayList("c1dc8d7be85325149", "ed5fc4d62b84752")); 26 | headers.put("Date", Lists.newArrayList("Wed, 11 Jan 2017 13:00:18 GMT")); 27 | HashMap config = Maps.newHashMap(); 28 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://example.org", "", 200, headers); 29 | 30 | assertEquals(new Integer(200), result.getHttpStatus()); 31 | assertEquals(2, result.getHeaders().size()); 32 | assertEquals("c1dc8d7be85325149\ned5fc4d62b84752", result.getHeaders().get("Etag")); 33 | assertEquals("Wed, 11 Jan 2017 13:00:18 GMT", result.getHeaders().get("Date")); 34 | } 35 | 36 | 37 | @Test 38 | public void htmlParsing() { 39 | HashMap config = Maps.newHashMap(); 40 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://bloomberg.com/", loadHtml("bloomberg.com"), 200, Maps.newHashMap()); 41 | assertEquals("Bloomberg.com", result.getTitle()); 42 | assertEquals(33, result.getMetaValues().size()); 43 | assertTrue(result.getMetaValues().contains("")); 44 | assertEquals(361, result.getLinks().size()); 45 | assertTrue(result.getLinks().contains("https://www.bloomberg.com/news/articles/2017-01-10/netanyahu-s-grip-on-power-under-threat-as-gift-scandal-escalates")); 46 | } 47 | 48 | @Test 49 | @Ignore 50 | public void fetchAndParse() { 51 | HashMap config = Maps.newHashMap(); 52 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true"); 53 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "http://www.tokenmill.lt/"); 54 | assertEquals("TokenMill - Natural Language Processing", result.getTitle()); 55 | assertEquals(10, result.getMetaValues().size()); 56 | assertEquals(42, result.getLinks().size()); 57 | assertTrue(result.getLinks().contains("http://www.tokenmill.lt/#case-monitoring")); 58 | assertTrue(result.getRobotsAllowedAll()); 59 | assertFalse(result.getRobotsAllowedNone()); 60 | assertTrue(result.getRobotsAllowedHome()); 61 | assertEquals(Lists.newArrayList(), result.getRobotsSitemaps()); 62 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay()); 63 | 64 | } 65 | 66 | @Test 67 | @Ignore 68 | public void fetchAndParseRobotsTxt() { 69 | HashMap config = Maps.newHashMap(); 70 | config.put(PageAnalyzer.CONFIG_ANALYZE_ROBOTS_TXT, "true"); 71 | HtmlAnalysisResult result = PageAnalyzer.analyze(config, "https://www.google.com"); 
72 | assertFalse(result.getRobotsAllowedAll()); 73 | assertFalse(result.getRobotsAllowedNone()); 74 | assertTrue(result.getRobotsAllowedHome()); 75 | assertTrue(result.getRobotsSitemaps().contains("http://www.gstatic.com/culturalinstitute/sitemaps/www_google_com_culturalinstitute/sitemap-index.xml")); 76 | assertEquals(Long.MIN_VALUE, (long) result.getRobotsCrawlDelay()); 77 | 78 | } 79 | 80 | private static String loadHtml(String name) { 81 | try { 82 | URL htmlResource = Resources.getResource(name + ".html"); 83 | return Resources.toString(htmlResource, Charsets.UTF_8); 84 | } catch (Exception e) { 85 | throw new RuntimeException(e); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /parser/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | parser 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | data-model 18 | ${project.version} 19 | 20 | 21 | org.jsoup 22 | jsoup 23 | 24 | 25 | com.github.jsonld-java 26 | jsonld-java 27 | 28 | 29 | com.google.guava 30 | guava 31 | 32 | 33 | org.apache.commons 34 | commons-lang3 35 | 3.5 36 | 37 | 38 | org.clojure 39 | clojure 40 | 1.7.0 41 | 42 | 43 | lt.tokenmill 44 | timewords 45 | ${timewords.version} 46 | 47 | 48 | org.slf4j 49 | slf4j-log4j12 50 | ${slf4j.version} 51 | provided 52 | 53 | 54 | junit 55 | junit 56 | 4.13.1 57 | test 58 | 59 | 60 | 61 | 62 | 63 | release 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-source-plugin 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-jar-plugin 74 | 75 | 76 | 77 | org.apache.maven.plugins 78 | maven-javadoc-plugin 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/PageAnalyzer.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | public class PageAnalyzer { 4 | 5 | 6 | 7 | } 8 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/TitleParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | 4 | import com.google.common.base.Strings; 5 | import com.google.common.collect.Lists; 6 | import com.google.common.collect.Maps; 7 | import lt.tokenmill.crawling.parser.data.MatchedString; 8 | import org.jsoup.nodes.Document; 9 | 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.stream.Collectors; 13 | 14 | public class TitleParser { 15 | 16 | private static final List TITLE_META_KEYS = Lists.newArrayList("og:title"); 17 | 18 | public static List extractFromMeta(Document document) { 19 | String itempropValue = document.select("[itemprop*=headline]").text(); 20 | if (itempropValue != null && !itempropValue.trim().isEmpty()) { 21 | return Lists.newArrayList(new MatchedString(itempropValue, "[itemprop*=headline]")); 22 | } 23 | Map metaValues = Maps.newHashMap(); 24 | document.select("meta").forEach(m -> { 25 | String name = m.attr("name"); 26 | String property = m.attr("property"); 27 | String content = m.attr("content"); 28 | if (!Strings.isNullOrEmpty(name)) { 29 | metaValues.put(name.toLowerCase(), content); 30 | } else if (!Strings.isNullOrEmpty(property)) { 31 | metaValues.put(property.toLowerCase(), content); 32 | } 
33 | }); 34 | return TITLE_META_KEYS.stream() 35 | .filter(k -> metaValues.get(k) != null) 36 | .map(k -> new MatchedString(metaValues.get(k), "META:" + k)) 37 | .collect(Collectors.toList()); 38 | } 39 | } -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedDate.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.data; 2 | 3 | import org.joda.time.DateTime; 4 | 5 | import java.util.Objects; 6 | 7 | public class MatchedDate { 8 | 9 | private String value; 10 | 11 | private String match; 12 | 13 | private String pattern; 14 | 15 | private DateTime date; 16 | 17 | public MatchedDate(String value, String match) { 18 | this.value = value; 19 | this.match = match; 20 | } 21 | 22 | public String getValue() { 23 | return value; 24 | } 25 | 26 | public void setValue(String value) { 27 | this.value = value; 28 | } 29 | 30 | public String getMatch() { 31 | return match; 32 | } 33 | 34 | public void setMatch(String match) { 35 | this.match = match; 36 | } 37 | 38 | public DateTime getDate() { 39 | return date; 40 | } 41 | 42 | public void setDate(DateTime date) { 43 | this.date = date; 44 | } 45 | 46 | public String getPattern() { 47 | return pattern; 48 | } 49 | 50 | public void setPattern(String pattern) { 51 | this.pattern = pattern; 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return "MatchedDate{" + 57 | "value='" + value + '\'' + 58 | ", match='" + match + '\'' + 59 | ", pattern='" + pattern + '\'' + 60 | ", date=" + date + 61 | '}'; 62 | } 63 | 64 | @Override 65 | public boolean equals(Object o) { 66 | if (this == o) return true; 67 | if (o == null || getClass() != o.getClass()) return false; 68 | MatchedDate that = (MatchedDate) o; 69 | return Objects.equals(value, that.value); 70 | } 71 | 72 | @Override 73 | public int hashCode() { 74 | return Objects.hash(value); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/data/MatchedString.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.data; 2 | 3 | import java.util.Objects; 4 | 5 | public class MatchedString { 6 | 7 | private String value; 8 | 9 | private String match; 10 | 11 | public MatchedString(String value, String match) { 12 | this.value = value; 13 | this.match = match; 14 | } 15 | 16 | public String getValue() { 17 | return value; 18 | } 19 | 20 | public void setValue(String value) { 21 | this.value = value; 22 | } 23 | 24 | public String getMatch() { 25 | return match; 26 | } 27 | 28 | public void setMatch(String match) { 29 | this.match = match; 30 | } 31 | 32 | @Override 33 | public boolean equals(Object o) { 34 | if (this == o) return true; 35 | if (o == null || getClass() != o.getClass()) return false; 36 | MatchedString that = (MatchedString) o; 37 | return Objects.equals(value, that.value); 38 | } 39 | 40 | @Override 41 | public int hashCode() { 42 | return Objects.hash(value); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/urls/UrlExtractor.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import org.jsoup.nodes.Document; 4 | import org.jsoup.select.Elements; 5 | 6 | import 
java.net.URI; 7 | import java.net.URISyntaxException; 8 | import java.util.HashSet; 9 | import java.util.Set; 10 | import java.util.stream.Collectors; 11 | 12 | public class UrlExtractor { 13 | 14 | private static boolean isAbsolute(String url) { 15 | try { 16 | URI uri = new URI(url); 17 | return uri.isAbsolute(); 18 | } catch (URISyntaxException e) { 19 | e.printStackTrace(); 20 | return false; 21 | } 22 | } 23 | 24 | private static Set<String> extract(Document document) { 25 | Set<String> canonicalUrls = new HashSet<>(); 26 | if (document == null) { 27 | return canonicalUrls; 28 | } 29 | 30 | Elements elements = document.select("meta[property=og:url]"); 31 | elements.forEach(element -> { 32 | String attr = element.attr("content"); 33 | if (attr != null) { 34 | canonicalUrls.add(attr); 35 | } 36 | }); 37 | 38 | elements = document.select("link[rel=canonical]"); 39 | elements.forEach(element -> { 40 | String attr = element.attr("href"); 41 | if (attr != null) { 42 | canonicalUrls.add(attr); 43 | } 44 | }); 45 | 46 | return canonicalUrls.stream() 47 | .filter(UrlExtractor::isAbsolute) 48 | .collect(Collectors.toSet()); 49 | } 50 | 51 | public static String extract(String url, Document document) { 52 | // extract(Document) never returns null; fall back to the passed-in URL when no canonical URL is found 53 | Set<String> canonicalUrls = extract(document); 54 | return canonicalUrls.stream().findFirst().orElse(url); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/HttpSourceTester.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Maps; 4 | import lt.tokenmill.crawling.data.*; 5 | import lt.tokenmill.crawling.parser.ArticleExtractor; 6 | import lt.tokenmill.crawling.parser.urls.UrlFilters; 7 | 8 | import java.util.Map; 9 | 10 | import static com.google.common.base.Strings.nullToEmpty; 11 | 12 | public class HttpSourceTester { 13 | 14 | public static final String URL_ACCEPTED = "url_accepted"; 15 | public static final String TITLE = "title"; 16 | public static final String TEXT = "text"; 17 | public static final String DATE = "date"; 18 | 19 | public static Map<String, Difference> test(HttpSource source, HttpSourceTest data) { 20 | TestResult result = new TestResult(); 21 | 22 | String url = data.getUrl(); 23 | UrlFilters urlFilters = UrlFilters.create(source.getUrlNormalizers(), source.getUrlFilters()); 24 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url); 25 | result.acceptedUrl(filteringResult.getAccepted(), data.getUrlAccepted()); 26 | 27 | String html = nullToEmpty(data.getHtml()).trim(); 28 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, source, null); 29 | HttpArticle article = parseResult.getArticle(); 30 | result.title(nullToEmpty(article.getTitle()), nullToEmpty(data.getTitle())); 31 | result.text(nullToEmpty(article.getText()), nullToEmpty(data.getText())); 32 | result.date(article.getPublished() != null ?
DataUtils.formatInUTC(article.getPublished()) : "", nullToEmpty(data.getDate())); 33 | 34 | return result.difference(); 35 | } 36 | 37 | public static class Difference { 38 | 39 | private String actual; 40 | 41 | private String expected; 42 | 43 | public Difference(String actual, String expected) { 44 | this.actual = actual; 45 | this.expected = expected; 46 | } 47 | 48 | public String getActual() { 49 | return actual; 50 | } 51 | 52 | public String getExpected() { 53 | return expected; 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return "Difference{" + 59 | "actual='" + actual + '\'' + 60 | ", expected='" + expected + '\'' + 61 | '}'; 62 | } 63 | } 64 | 65 | public static class TestResult { 66 | 67 | private boolean expectedUrlAccepted; 68 | private boolean actualUrlAccepted; 69 | private String expectedTitle; 70 | private String actualTitle; 71 | private String expectedText; 72 | private String actualText; 73 | private String expectedDate; 74 | private String actualDate; 75 | 76 | void acceptedUrl(boolean actual, boolean expected) { 77 | this.expectedUrlAccepted = expected; 78 | this.actualUrlAccepted = actual; 79 | } 80 | 81 | public void title(String actual, String expected) { 82 | this.expectedTitle = expected.trim(); 83 | this.actualTitle = actual.trim(); 84 | } 85 | 86 | public void text(String actual, String expected) { 87 | this.expectedText = expected.trim(); 88 | this.actualText = actual.trim(); 89 | } 90 | 91 | public void date(String actual, String expected) { 92 | this.expectedDate = expected.trim(); 93 | this.actualDate = actual.trim(); 94 | } 95 | 96 | public Map<String, Difference> difference() { 97 | Map<String, Difference> result = Maps.newLinkedHashMap(); 98 | if (expectedUrlAccepted != actualUrlAccepted) { 99 | result.put(URL_ACCEPTED, 100 | new Difference(String.valueOf(actualUrlAccepted), String.valueOf(expectedUrlAccepted))); 101 | } 102 | if (!expectedTitle.equals(actualTitle)) { 103 | result.put(TITLE, new Difference(actualTitle, expectedTitle)); 104 | } 105 | if (!expectedText.equals(actualText)) { 106 | result.put(TEXT, new Difference(actualText, expectedText)); 107 | } 108 | if (!expectedDate.equals(actualDate)) { 109 | result.put(DATE, new Difference(actualDate, expectedDate)); 110 | } 111 | return result; 112 | } 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/QueryParser.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.base.Strings; 4 | import com.google.common.collect.Lists; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.stream.Collectors; 9 | 10 | public class QueryParser { 11 | 12 | public static List<String> parseQuery(String query) { 13 | List<String> result = Lists.newArrayList(); 14 | if (!Strings.isNullOrEmpty(query)) { 15 | query = query.replaceAll("(\\s*[+-]\\s*)", "#SPLIT#$1"); 16 | return Arrays.stream(query.split("(#SPLIT#| )")) 17 | .map(String::trim) 18 | .filter(s -> !s.isEmpty()) 19 | .collect(Collectors.toList()); 20 | } 21 | return result; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /parser/src/main/java/lt/tokenmill/crawling/parser/utils/TextFilters.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import java.util.List; 4 | import java.util.Objects; 5 |
import java.util.regex.Pattern; 6 | 7 | public class TextFilters { 8 | 9 | // Normalizer is of format [match regexp]-->>[replacement string] 10 | // Normalizers that don't match the format are ignored 11 | // [match regexp]s that don't compile are ignored 12 | // String t can be null. 13 | // if textNormalizers is null then t is returned. 14 | public static String normalizeText(String t, List<String> textNormalizers) { 15 | t = Objects.toString(t, ""); 16 | if (textNormalizers == null) 17 | return t; 18 | return textNormalizers.stream() 19 | .filter(tn -> tn.contains("-->>")) 20 | .reduce(t, (a, tn) -> { 21 | String[] parts = tn.split("-->>"); 22 | String match = parts[0]; 23 | try { 24 | Pattern.compile(match); 25 | } catch (Exception e) { 26 | return a; 27 | } 28 | String replacement = parts.length > 1 ? parts[1] : ""; 29 | return a.replaceAll(match, replacement); 30 | }).trim(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/AljazeeraExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.junit.Test; 6 | 7 | import java.util.Arrays; 8 | 9 | import static junit.framework.TestCase.assertEquals; 10 | 11 | public class AljazeeraExtractorTest extends BaseArticleExtractorTest { 12 | 13 | @Test 14 | public void testAljazeera1() throws Exception { 15 | String html = loadArticle("aljazeera1"); 16 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html"; 17 | HttpArticle article = ArticleExtractor.extractArticle(html, url, getSourceConf(), null); 18 | assertEquals("2018-05-13T00:00:00.000Z", article.getPublished().toInstant().toString()); 19 | } 20 | 21 | private HttpSource getSourceConf() { 22 | HttpSource source = new HttpSource(); 23 | source.setDateSelectors(Arrays.asList(".article-duration")); 24 | return source; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/BaseArticleExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | 6 | import java.net.URL; 7 | 8 | public abstract class BaseArticleExtractorTest { 9 | 10 | protected String loadArticle(String name) throws Exception { 11 | URL htmlResource = Resources.getResource("articles/" + name + ".html"); 12 | return Resources.toString(htmlResource, Charsets.UTF_8); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/BloombergExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 5 | import lt.tokenmill.crawling.data.HttpSource; 6 | import org.joda.time.DateTime; 7 | import org.joda.time.DateTimeZone; 8 | import org.junit.Test; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | import static org.junit.Assert.assertTrue; 12 | 13 | public class BloombergExtractorTest extends
BaseArticleExtractorTest { 14 | 15 | 16 | @Test 17 | public void testBloomberg1() throws Exception { 18 | String html = loadArticle("bloomberg1"); 19 | String url = "http://www.bloomberg.com/news/articles/2016-09-08/japan-index-futures-signal-bounce-as-ecb-outlook-weighs-on-bonds"; 20 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, bloombergSource(), null); 21 | HttpArticle article = parseResult.getArticle(); 22 | assertEquals("Stocks Sink With Bonds, Dollar Rallies as Complacency Broken", article.getTitle()); 23 | assertTrue(article.getText().contains("erted declines of this size in stocks and bonds are rare though not ")); 24 | assertTrue(article.getText().startsWith("Tranquility that has enveloped global")); 25 | assertEquals(parseResult.getPublishedMatches().get(0), "META:parsely-pub-date"); 26 | DateTime actualPublished = article.getPublished(); 27 | DateTime expectedPublished = new DateTime(2016, 9, 8, 23, 14, 29, 36, DateTimeZone.UTC); 28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 29 | } 30 | 31 | private HttpSource bloombergSource() { 32 | HttpSource source = new HttpSource(); 33 | return source; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/CyberscoopExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.junit.Test; 8 | 9 | import java.util.Arrays; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | 13 | public class CyberscoopExtractorTest extends BaseArticleExtractorTest { 14 | 15 | private static final String TITLE_SELECTOR = "h1.article__title"; 16 | 17 | private HttpSource cyberscoopSourceWithoutTitleSelector() { 18 | HttpSource source = new HttpSource(); 19 | return source; 20 | } 21 | 22 | private HttpSource cyberscoopSourceWithTitleSelector() { 23 | HttpSource source = new HttpSource(); 24 | source.setTitleSelectors(Arrays.asList(TITLE_SELECTOR)); 25 | return source; 26 | } 27 | 28 | @Test 29 | public void testTitleExtraction000() throws Exception { 30 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/"; 31 | String html = loadArticle("cyberscoop1"); 32 | Document document = Jsoup.parse(html, url); 33 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithoutTitleSelector(), null); 34 | assertEquals(1, article.getTitleMatches().size()); 35 | assertEquals("META:og:title", article.getTitleMatches().get(0)); 36 | } 37 | 38 | @Test 39 | public void testTitleExtraction001() throws Exception { 40 | String url = "https://www.cyberscoop.com/u-s-oil-gas-companies-still-trying-catch-cybersecurity-experts-say/"; 41 | String html = loadArticle("cyberscoop1"); 42 | Document document = Jsoup.parse(html, url); 43 | HttpArticleParseResult article = ArticleExtractor.extractArticleWithDetails(html, url, cyberscoopSourceWithTitleSelector(), null); 44 | assertEquals(1, article.getTitleMatches().size()); 45 | assertEquals(TITLE_SELECTOR, article.getTitleMatches().get(0)); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- 
/parser/src/test/java/lt/tokenmill/crawling/parser/FortuneExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpSource; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.TestCase.assertEquals; 8 | 9 | public class FortuneExtractorTest extends BaseArticleExtractorTest { 10 | 11 | @Test 12 | public void testFortune1() throws Exception { 13 | String html = loadArticle("fortune1"); 14 | String url = "http://fortune.com/2017/04/13/susan-fowler-uber-editor-stripe/"; 15 | HttpArticle article = ArticleExtractor.extractArticle(html, url, fortuneSource(), "2017/04/13"); 16 | assertEquals("2017-04-13T00:00:00.000Z", article.getPublished().toInstant().toString()); 17 | } 18 | 19 | private HttpSource fortuneSource() { 20 | HttpSource source = new HttpSource(); 21 | return source; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/InvestingParserTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.DateTimeZone; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertTrue; 13 | 14 | public class InvestingParserTest extends BaseArticleExtractorTest { 15 | 16 | 17 | @Test 18 | public void testInvesting1() throws Exception { 19 | String html = loadArticle("investing1"); 20 | String url = "https://www.investing.com/analysis/opening-bell:-brexit,-davos-meetings-are-today%E2%80%99s-big-drivers-200172664"; 21 | HttpArticleParseResult result = ArticleExtractor.extractArticleWithDetails(html, url, investingSource(), null); 22 | HttpArticle article = result.getArticle(); 23 | assertEquals("Opening Bell: USD Drops, Pound Pops, Yen Soars", article.getTitle()); 24 | assertTrue(article.getText().startsWith("by Eli Wright\nAs markets in the US return from the long holiday weekend")); 25 | assertTrue(article.getText().endsWith("ab Corporation (NYSE:SCHW) expects EPS of $0.36.")); 26 | DateTime actualPublished = article.getPublished(); 27 | DateTime expectedPublished = new DateTime(2017, 1, 17, 11, 8, 00, 00, DateTimeZone.UTC); 28 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 29 | } 30 | 31 | private HttpSource investingSource() { 32 | HttpSource source = new HttpSource(); 33 | source.setTextSelectors(Lists.newArrayList("#contentSection p, #contentSection li")); 34 | source.setDateSelectors(Lists.newArrayList(".contentSectionDetails span")); 35 | source.setDateRegexps(Lists.newArrayList(".*\\((.+)\\).*")); 36 | return source; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/KedainietisTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import lt.tokenmill.crawling.data.HttpArticle; 4 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 5 | import lt.tokenmill.crawling.data.HttpSource; 
6 | import org.junit.Test; 7 | 8 | import java.util.Arrays; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | import static org.junit.Assert.assertNotNull; 12 | import static org.junit.Assert.assertTrue; 13 | 14 | public class KedainietisTest extends BaseArticleExtractorTest{ 15 | 16 | private HttpSource kedainietisSource() { 17 | HttpSource source = new HttpSource(); 18 | source.setLanguage("lt"); 19 | source.setDateSelectors(Arrays.asList("span.dtreviewed")); 20 | return source; 21 | } 22 | 23 | @Test 24 | public void testKedainietis() throws Exception { 25 | String html = loadArticle("kedainietis"); 26 | String url = "http://www.kedainietis.lt/naujienos/naujienos/nedeklaravus-gyvenamosios-vietos-nepasieks-ir-sodros-mokami-alimentai-17694/"; 27 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, kedainietisSource(), null); 28 | HttpArticle article = parseResult.getArticle(); 29 | assertEquals("Nedeklaravus gyvenamosios vietos, nepasieks ir „Sodros“ mokami alimentai".trim(), article.getTitle().trim()); 30 | assertTrue(article.getText().contains("valstybės biudžeto Lietuvoje")); 31 | assertTrue(article.getText().startsWith("Iš valstybės")); 32 | assertEquals(parseResult.getPublishedMatches().get(0), "span.dtreviewed"); 33 | assertNotNull(article.getPublished()); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/ReutersExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.HttpArticle; 5 | import lt.tokenmill.crawling.data.HttpArticleParseResult; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.DateTimeZone; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertNull; 13 | import static org.junit.Assert.assertTrue; 14 | 15 | public class ReutersExtractorTest extends BaseArticleExtractorTest { 16 | 17 | 18 | @Test 19 | public void testReuters1() throws Exception { 20 | String html = loadArticle("reuters1"); 21 | String url = "http://www.reuters.com/finance/stocks/TEX/key-developments/article/3414284"; 22 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null); 23 | assertEquals("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring- CNBC, citing source", article.getTitle()); 24 | assertTrue(article.getText().contains("Marcato reports 5.1 pct stake in Terex, to urge spinoff & restructuring; Marcato supports Terex CEO - CNBC, citing source")); 25 | DateTime actualPublished = article.getPublished(); 26 | DateTime expectedPublished = new DateTime(2016, 7, 28, 15, 35, DateTimeZone.UTC); 27 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 28 | } 29 | 30 | @Test 31 | public void testReuters2() throws Exception { 32 | String html = loadArticle("reuters2"); 33 | String url = "http://www.reuters.com/article/idUSFWN1B40B5"; 34 | HttpArticleParseResult parseResult = ArticleExtractor.extractArticleWithDetails(html, url, reutersSource(), null); 35 | HttpArticle article = parseResult.getArticle(); 36 | assertEquals("BRIEF-Canadian Solar unit Recurrent Energy reached commercial operation of 100 MWac/134 MWp", article.getTitle()); 37 | assertTrue(article.getText().contains("Unit 
Recurrent Energy has reached commercial operation of 100 MWac/134 MWp Mustang solar power project")); 38 | assertEquals("LD+JSON", parseResult.getPublishedMatches().get(0)); 39 | DateTime expectedPublished = new DateTime(2016, 8, 23, 12, 24, 3, DateTimeZone.UTC); 40 | DateTime actualPublished = article.getPublished(); 41 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 42 | } 43 | 44 | @Test 45 | public void testReuters3() throws Exception { 46 | String html = loadArticle("reuters3"); 47 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersSource(), null); 49 | assertEquals("Tesla touts speed and driving range with new upgraded battery", article.getTitle()); 50 | assertTrue(article.getText().contains(" models. But Musk said those were both millio")); 51 | DateTime expectedPublished = new DateTime(2016, 8, 23, 22, 41, 57, DateTimeZone.UTC); 52 | DateTime actualPublished = article.getPublished(); 53 | assertTrue(actualPublished.toDate().equals(expectedPublished.toDate())); 54 | } 55 | 56 | @Test 57 | public void testReutersBlog1() throws Exception { 58 | String html = loadArticle("reuters-blogs1"); 59 | String url = "http://blogs.reuters.com/breakingviews/2016/08/22/pfizer-bets-14-bln-it-knows-better-than-market/"; 60 | HttpArticle article = ArticleExtractor.extractArticle(html, url, reutersBlogsSource(), null); 61 | assertEquals("Pfizer bets $14 bln it knows better than market", article.getTitle()); 62 | assertTrue(article.getText().contains("r may believe in a far more lucrative outcom")); 63 | DateTime actualPublished = article.getPublished(); 64 | assertNull(actualPublished); 65 | } 66 | 67 | 68 | private HttpSource reutersSource() { 69 | HttpSource source = new HttpSource(); 70 | source.setTitleSelectors(Lists.newArrayList("h1")); 71 | source.setDateSelectors(Lists.newArrayList("#sigDevArticleText .timestamp")); 72 | source.setTextSelectors(Lists.newArrayList("#article-text p")); 73 | return source; 74 | } 75 | 76 | private HttpSource reutersBlogsSource() { 77 | HttpSource source = new HttpSource(); 78 | source.setTextSelectors(Lists.newArrayList("#postcontent p")); 79 | return source; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlExtractorTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class UrlExtractorTest extends BaseArticleExtractorTest { 11 | 12 | @Test 13 | public void testExtraction00() throws Exception { 14 | String html = loadArticle("aljazeera1"); 15 | String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html"; 16 | Document document = Jsoup.parse(html); 17 | assertEquals(url, UrlExtractor.extract(url, document)); 18 | assertEquals("https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html", UrlExtractor.extract("", document)); 19 | } 20 | 21 | @Test 22 | public void testExtraction01() throws Exception { 23 | String html = loadArticle("kedainietis"); 24 | String url = "url"; 25 | Document document = 
Jsoup.parse(html); 26 | assertEquals(url, UrlExtractor.extract(url, document)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/urls/UrlFiltersTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.urls; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.Arrays; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class UrlFiltersTest { 10 | 11 | @Test 12 | public void testURLNormalizer000() { 13 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("a-->>b"), Arrays.asList()); 14 | assertEquals("bbbb", urlFilters.filterWithDetails("aaaa").getNormalized()); 15 | assertEquals("bbbb", urlFilters.filterWithDetails("abba").getNormalized()); 16 | 17 | urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList()); 18 | String url = "http://www.tokenmill.lt/#case-understand"; 19 | assertEquals("http://www.tokenmill.lt/", urlFilters.filterWithDetails(url).getNormalized()); 20 | } 21 | 22 | @Test 23 | public void testURLFilters000() { 24 | String url = "http://www.tokenmill.lt/#case-understand"; 25 | UrlFilters urlFilters = UrlFilters.create(Arrays.asList("#.*-->>"), Arrays.asList("+^http://www.tokenmill.lt/.*", "-.*apache.*")); 26 | UrlFilters.FilteringResult filteringResult = urlFilters.filterWithDetails(url); 27 | assertEquals(true, filteringResult.getAccepted()); 28 | assertEquals("+^http://www.tokenmill.lt/.*", filteringResult.getFilter()); 29 | assertEquals(1, filteringResult.getNormalizers().size()); 30 | assertEquals("http://www.tokenmill.lt/", filteringResult.getNormalized()); 31 | 32 | assertEquals("http://www.tokenmill.lt/", urlFilters.filter(url)); 33 | assertEquals(null, urlFilters.filter("http://nutch.apache.org/")); 34 | 35 | filteringResult = urlFilters.filterWithDetails("http://nutch.apache.org/"); 36 | assertEquals(false, filteringResult.getAccepted()); 37 | assertEquals("-.*apache.*", filteringResult.getFilter()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/utils/HttpSourceTesterTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Lists; 4 | import lt.tokenmill.crawling.data.DataUtils; 5 | import lt.tokenmill.crawling.data.HttpArticle; 6 | import lt.tokenmill.crawling.data.HttpSource; 7 | import lt.tokenmill.crawling.data.HttpSourceTest; 8 | import lt.tokenmill.crawling.parser.ArticleExtractor; 9 | import lt.tokenmill.crawling.parser.BaseArticleExtractorTest; 10 | import org.junit.Test; 11 | 12 | import java.util.Map; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class HttpSourceTesterTest extends BaseArticleExtractorTest { 17 | 18 | @Test 19 | public void exactMatch() throws Exception { 20 | String html = loadArticle("reuters3"); 21 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 22 | 23 | HttpSource source = new HttpSource(); 24 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$")); 25 | 26 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null); 27 | 28 | HttpSourceTest sourceTest = new HttpSourceTest(); 29 | sourceTest.setHtml(html); 30 | sourceTest.setUrl(url); 31 | sourceTest.setUrlAccepted(true); 32 | 
sourceTest.setTitle(article.getTitle()); 33 | sourceTest.setDate(DataUtils.formatInUTC(article.getPublished())); 34 | sourceTest.setText(article.getText()); 35 | 36 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest); 37 | assertEquals(0, differences.size()); 38 | } 39 | 40 | @Test 41 | public void allDifferent() throws Exception { 42 | String html = loadArticle("reuters3"); 43 | String url = "http://www.reuters.com/article/us-tesla-product-idUSKCN10Y1R2"; 44 | 45 | HttpSource source = new HttpSource(); 46 | source.setUrlFilters(Lists.newArrayList("+https?://www.reuters.com/.+$")); 47 | 48 | HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null); 49 | 50 | HttpSourceTest sourceTest = new HttpSourceTest(); 51 | sourceTest.setHtml(html); 52 | sourceTest.setUrl(url); 53 | sourceTest.setUrlAccepted(false); 54 | sourceTest.setTitle("Title"); 55 | sourceTest.setDate("Published"); 56 | sourceTest.setText("Text"); 57 | 58 | Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, sourceTest); 59 | assertEquals(4, differences.size()); 60 | assertEquals("false", differences.get(HttpSourceTester.URL_ACCEPTED).getExpected()); 61 | assertEquals("true", differences.get(HttpSourceTester.URL_ACCEPTED).getActual()); 62 | assertEquals("Title", differences.get(HttpSourceTester.TITLE).getExpected()); 63 | assertEquals(article.getTitle(), differences.get(HttpSourceTester.TITLE).getActual()); 64 | assertEquals("Published", differences.get(HttpSourceTester.DATE).getExpected()); 65 | assertEquals(DataUtils.formatInUTC(article.getPublished()), differences.get(HttpSourceTester.DATE).getActual()); 66 | assertEquals("Text", differences.get(HttpSourceTester.TEXT).getExpected()); 67 | assertEquals(article.getText(), differences.get(HttpSourceTester.TEXT).getActual()); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /parser/src/test/java/lt/tokenmill/crawling/parser/utils/QueryParserTest.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.parser.utils; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.junit.Test; 5 | 6 | import java.util.List; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class QueryParserTest { 11 | 12 | @Test 13 | public void parseQuery() { 14 | List<String> parts = QueryParser.parseQuery("+Turkey-Inflation"); 15 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts); 16 | 17 | parts = QueryParser.parseQuery("+Turkey -Inflation"); 18 | assertEquals(Lists.newArrayList("+Turkey", "-Inflation"), parts); 19 | 20 | parts = QueryParser.parseQuery("Turkey -Inflation"); 21 | assertEquals(Lists.newArrayList("Turkey", "-Inflation"), parts); 22 | 23 | parts = QueryParser.parseQuery("+Turkey attack"); 24 | assertEquals(Lists.newArrayList("+Turkey", "attack"), parts); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /parser/src/test/resources/jsonld/bbc-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "http:\/\/schema.org", 3 | "@type": "Article", 4 | "url": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745", 5 | "publisher": { 6 | "@type": "Organization", 7 | "name": "BBC News", 8 | "logo": { 9 | "@type": "ImageObject", 10 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1" 11 | } 12 | }, 13 | "datePublished": "2017-08-30T10:32:11+01:00", 14 | "dateModified":
"2017-08-30T10:32:11+01:00", 15 | "headline": "Venezuela: New assembly approves treason trials for opposition", 16 | "image": { 17 | "@type": "ImageObject", 18 | "width": 720, 19 | "height": 405, 20 | "url": "https:\/\/ichef-1.bbci.co.uk\/news\/720\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg" 21 | }, 22 | "thumbnailUrl": "https:\/\/ichef.bbci.co.uk\/news\/208\/cpsprodpb\/11EF3\/production\/_97595437_mediaitem97595433.jpg", 23 | "author": { 24 | "@type": "Organization", 25 | "name": "BBC News", 26 | "logo": { 27 | "@type": "ImageObject", 28 | "url": "http:\/\/www.bbc.co.uk\/news\/special\/2015\/newsspec_10857\/bbc_news_logo.png?cb=1" 29 | } 30 | }, 31 | "mainEntityOfPage": "http:\/\/www.bbc.com\/news\/world-latin-america-41091745" 32 | } -------------------------------------------------------------------------------- /ui-commons/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | crawling-framework 7 | lt.tokenmill.crawling 8 | 0.3.4-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | ui-commons 13 | 14 | 15 | 16 | lt.tokenmill.crawling 17 | elasticsearch 18 | 19 | 20 | lt.tokenmill.crawling 21 | parser 22 | 23 | 24 | 25 | 26 | 27 | release 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-source-plugin 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-jar-plugin 38 | 39 | 40 | 41 | org.apache.maven.plugins 42 | maven-javadoc-plugin 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /ui-commons/src/main/java/lt/tokenmill/crawling/commonui/Configuration.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.commonui; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.util.Properties; 7 | 8 | public class Configuration { 9 | 10 | public static final Configuration INSTANCE = new Configuration(); 11 | 12 | private static final String DEFAULT_CONFIG_FILE_LOCATION = "conf/development.properties"; 13 | private final Properties properties = new Properties(); 14 | 15 | private Configuration() { 16 | try { 17 | properties.load(new FileInputStream(new File(System.getProperty("config", DEFAULT_CONFIG_FILE_LOCATION)))); 18 | } catch (IOException e) { 19 | throw new RuntimeException(e); 20 | } 21 | } 22 | 23 | public String getString(String key, String defaultValue) { 24 | return properties.getProperty(key, defaultValue); 25 | } 26 | 27 | public int getInt(String key, int defaultValue) { 28 | return Integer.parseInt(properties.getProperty(key, Integer.toString(defaultValue))); 29 | } 30 | 31 | public String getString(String key) { 32 | return properties.getProperty(key); 33 | } 34 | 35 | public int getInt(String key) { 36 | return Integer.parseInt(properties.getProperty(key)); 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "Configuration{" + 42 | "properties='" + properties + "'" + 43 | "}"; 44 | } 45 | } -------------------------------------------------------------------------------- /ui-commons/src/main/java/lt/tokenmill/crawling/commonui/ElasticSearch.java: -------------------------------------------------------------------------------- 1 | package lt.tokenmill.crawling.commonui; 2 | 3 | import lt.tokenmill.crawling.es.*; 4 | 5 | public class ElasticSearch { 6 | 7 | private static ElasticConnection CONNECTION; 8 | private static EsHttpSourceOperations HTTP_SOURCE_OPERATIONS; 9 | private static EsHttpSourceTestOperations 
HTTP_SOURCE_TEST_OPERATIONS; 10 | private static EsNamedQueryOperations NAMED_QUERY_OPERATIONS; 11 | private static EsDocumentOperations DOCUMENT_OPERATIONS; 12 | private static EsHttpUrlOperations URL_OPERATIONS; 13 | 14 | public static EsHttpSourceOperations getHttpSourceOperations() { 15 | if (HTTP_SOURCE_OPERATIONS == null) { 16 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_INDEX_NAME_PARAM); 17 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_DOC_TYPE_PARAM); 18 | HTTP_SOURCE_OPERATIONS = EsHttpSourceOperations.getInstance(getEsConnection(), index, type); 19 | } 20 | return HTTP_SOURCE_OPERATIONS; 21 | } 22 | 23 | public static EsHttpSourceTestOperations getHttpSourceTestOperations() { 24 | if (HTTP_SOURCE_TEST_OPERATIONS == null) { 25 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_INDEX_NAME_PARAM); 26 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_HTTP_SOURCES_TEST_TYPE_PARAM); 27 | HTTP_SOURCE_TEST_OPERATIONS = EsHttpSourceTestOperations.getInstance(getEsConnection(), index, type); 28 | } 29 | return HTTP_SOURCE_TEST_OPERATIONS; 30 | } 31 | 32 | public static EsNamedQueryOperations getNamedQueryOperations() { 33 | if (NAMED_QUERY_OPERATIONS == null) { 34 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_INDEX_PARAM); 35 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_NAMED_QUERIES_TYPE_PARAM); 36 | NAMED_QUERY_OPERATIONS = EsNamedQueryOperations.getInstance(getEsConnection(), index, type); 37 | } 38 | return NAMED_QUERY_OPERATIONS; 39 | } 40 | 41 | 42 | public static EsDocumentOperations getDocumentOperations() { 43 | if (DOCUMENT_OPERATIONS == null) { 44 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_INDEX_NAME_PARAM); 45 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_DOCS_DOC_TYPE_PARAM); 46 | DOCUMENT_OPERATIONS = EsDocumentOperations.getInstance(getEsConnection(), index, type); 47 | } 48 | return DOCUMENT_OPERATIONS; 49 | } 50 | 51 | public static EsHttpUrlOperations getUrlOperations() { 52 | if (URL_OPERATIONS == null) { 53 | String index = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_INDEX_NAME_PARAM); 54 | String type = Configuration.INSTANCE.getString(ElasticConstants.ES_URLS_DOC_TYPE_PARAM); 55 | URL_OPERATIONS = EsHttpUrlOperations.getInstance(getEsConnection(), index, type); 56 | } 57 | return URL_OPERATIONS; 58 | } 59 | 60 | private static ElasticConnection getEsConnection() { 61 | if (CONNECTION == null) { 62 | String hostname = Configuration.INSTANCE.getString(ElasticConstants.ES_HOSTNAME_PARAM, "localhost"); 63 | int restPort = Configuration.INSTANCE.getInt(ElasticConstants.ES_REST_PORT, 9200); 64 | String restScheme = Configuration.INSTANCE.getString(ElasticConstants.ES_REST_SCHEME, "http"); 65 | CONNECTION = ElasticConnection.getConnection(hostname, restPort, restScheme); 66 | } 67 | return CONNECTION; 68 | } 69 | } 70 | --------------------------------------------------------------------------------
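Usage sketch (illustrative, not a file from the repository): the parser module can be driven end to end with just HttpSource, ArticleExtractor and HttpSourceTester, as the tests above do. The class below is a minimal sketch assembled from the APIs visible in this listing; the URL, HTML and selector values are hypothetical placeholders that would have to match a real site's markup.

import com.google.common.collect.Lists;
import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpSourceTest;
import lt.tokenmill.crawling.parser.ArticleExtractor;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;

import java.util.Map;

public class ParserPipelineExample {

    public static void main(String[] args) {
        // Hypothetical source definition; selectors depend entirely on the target site.
        HttpSource source = new HttpSource();
        source.setUrlFilters(Lists.newArrayList("+^https?://example\\.com/news/.*"));
        source.setTitleSelectors(Lists.newArrayList("h1.headline"));
        source.setTextSelectors(Lists.newArrayList("article p"));
        source.setDateSelectors(Lists.newArrayList(".published-at"));

        String url = "https://example.com/news/sample-article";
        String html = "<html>...</html>"; // in practice, the fetched page body

        // One-shot extraction of title, body text and publication date.
        HttpArticle article = ArticleExtractor.extractArticle(html, url, source, null);
        System.out.println(article.getTitle());

        // Regression check: an empty difference map means the source
        // configuration still extracts exactly what the test expects.
        HttpSourceTest expectation = new HttpSourceTest();
        expectation.setUrl(url);
        expectation.setHtml(html);
        expectation.setUrlAccepted(true);
        expectation.setTitle("Expected headline");
        Map<String, HttpSourceTester.Difference> differences = HttpSourceTester.test(source, expectation);
        differences.forEach((field, diff) -> System.out.println(field + ": " + diff));
    }
}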
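URL normalization and filtering can also be exercised on its own; the inputs and expected outputs below are taken directly from UrlFiltersTest above.

import lt.tokenmill.crawling.parser.urls.UrlFilters;

import java.util.Arrays;

public class UrlFiltersExample {

    public static void main(String[] args) {
        // Normalizers use the "regexp-->>replacement" syntax;
        // filters are "+regexp" (accept) and "-regexp" (reject) rules.
        UrlFilters filters = UrlFilters.create(
                Arrays.asList("#.*-->>"),                      // strip URL fragments
                Arrays.asList("+^http://www.tokenmill.lt/.*",  // accept site URLs
                        "-.*apache.*"));                       // reject anything matching "apache"

        // An accepted URL is returned in normalized form; a rejected URL yields null.
        System.out.println(filters.filter("http://www.tokenmill.lt/#case-understand")); // http://www.tokenmill.lt/
        System.out.println(filters.filter("http://nutch.apache.org/"));                 // null
    }
}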
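TitleParser and UrlExtractor both work from a parsed Jsoup document. A small sketch with a hypothetical page, showing how each candidate title carries the selector or meta key that produced it, and how canonical-URL extraction falls back to the passed-in URL when the document yields nothing absolute.

import lt.tokenmill.crawling.parser.TitleParser;
import lt.tokenmill.crawling.parser.data.MatchedString;
import lt.tokenmill.crawling.parser.urls.UrlExtractor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.List;

public class MetadataExample {

    public static void main(String[] args) {
        // Hypothetical page with an og:title meta tag and a canonical link.
        Document document = Jsoup.parse(
                "<html><head>"
                        + "<meta property='og:title' content='Example headline'/>"
                        + "<link rel='canonical' href='https://example.com/article'/>"
                        + "</head><body></body></html>");

        // No [itemprop*=headline] element here, so the og:title meta value wins.
        List<MatchedString> titles = TitleParser.extractFromMeta(document);
        titles.forEach(t -> System.out.println(t.getValue() + " <- " + t.getMatch())); // Example headline <- META:og:title

        // The canonical link is absolute, so it replaces the fallback URL.
        System.out.println(UrlExtractor.extract("https://fallback.example.com", document)); // https://example.com/article
    }
}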
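The two text utilities are pure static functions, so their behavior is easy to pin down with concrete inputs; the expected outputs in the comments follow from QueryParserTest and from the normalizer format documented in TextFilters ([match regexp]-->>[replacement string]).

import lt.tokenmill.crawling.parser.utils.QueryParser;
import lt.tokenmill.crawling.parser.utils.TextFilters;

import java.util.Arrays;
import java.util.List;

public class TextUtilsExample {

    public static void main(String[] args) {
        // Query terms are split in front of each +/- operator.
        List<String> parts = QueryParser.parseQuery("+Turkey-Inflation");
        System.out.println(parts); // [+Turkey, -Inflation]

        // Normalizers apply in order; an entry without "-->>" or with a
        // regexp that does not compile is skipped rather than failing.
        List<String> normalizers = Arrays.asList(
                "Advertisement-->>", // drop boilerplate
                "\\s+-->> ");        // collapse whitespace runs
        System.out.println(TextFilters.normalizeText("Advertisement\n\nSome   text", normalizers)); // Some text
    }
}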
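Configuration in ui-commons resolves its properties file from the config system property and falls back to conf/development.properties, as its constructor shows. A minimal sketch of how a UI process might read connection settings; the literal property keys below are assumptions for illustration only, since the real key names come from ElasticConstants, which is not part of this listing.

import lt.tokenmill.crawling.commonui.Configuration;

public class ConfigurationExample {

    public static void main(String[] args) {
        // Started as: java -Dconfig=/path/to/your.properties ConfigurationExample
        // Without -Dconfig, conf/development.properties is loaded.
        String host = Configuration.INSTANCE.getString("es.hostname", "localhost"); // illustrative key
        int port = Configuration.INSTANCE.getInt("es.rest.port", 9200);             // illustrative key
        System.out.println("Elasticsearch at " + host + ":" + port);
    }
}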