├── .github └── FUNDING.yml ├── bin ├── restart.sh ├── start.sh ├── stop.sh ├── crawlstart.py └── start_crawler_docker.sh ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── .dockerignore ├── .gitmodules ├── .gitignore ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.buildship.core.prefs ├── conf ├── indexer_blacklist_filetypes.txt ├── crawler_blacklist_localhost.txt └── config.properties ├── src └── main │ ├── resources │ └── log4j.properties │ └── java │ └── net │ └── yacy │ └── grid │ └── crawler │ ├── api │ ├── CrawlerDefaultValuesService.java │ └── CrawlStartService.java │ ├── Blacklist.java │ ├── Crawler.java │ └── CrawlerListener.java ├── .project ├── Dockerfile ├── .classpath ├── gradlew.bat ├── README.md └── gradlew /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: orbiter 2 | patreon: 0rb1t3r 3 | -------------------------------------------------------------------------------- /bin/restart.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | ./stop.sh 4 | sleep 1 5 | ./start.sh 6 | 7 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yacy/yacy_grid_crawler/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | data 4 | build 5 | bin 6 | docker 7 | Dockerfile 8 | LICENSE.md 9 | README.md 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/yacy_grid_mcp"] 2 | path = submodules/yacy_grid_mcp 3 | url = https://github.com/yacy/yacy_grid_mcp.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | target/ 3 | data/ 4 | /class/ 5 | /.gradle/ 6 | /build/ 7 | .DS_Store 8 | .settings 9 | .idea/ 10 | bin/ai/ 11 | bin/log4j.properties 12 | bin/net/ 13 | bin/org/ -------------------------------------------------------------------------------- /bin/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | cd .. 4 | nohup java -jar build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar < /dev/null & 5 | sleep 1 6 | echo "YaCy Grid Crawler started!" 
7 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.source=1.8 5 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /conf/indexer_blacklist_filetypes.txt: -------------------------------------------------------------------------------- 1 | # Indexing Blacklist for bad file types 2 | 3 | .*?\.xml # Reject XML in search index 4 | .*?\.css # Reject CSS in search index 5 | .*?\.js # Reject JavaScript in search index 6 | .*?/robots\.txt # Reject robots.txt -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 10 | 11 | log4j.logger.org.eclipse.jetty = INFO 12 | log4j.logger.org.apache.http = INFO 13 | -------------------------------------------------------------------------------- /conf/crawler_blacklist_localhost.txt: -------------------------------------------------------------------------------- 1 | # Blacklist for local, private or intranet URLs 2 | 3 | .*?//localhost.*+ # Localhost host name 4 | .*?//127\..*+ # Localhost IPv4 5 | .*?//10\..*+ # Private IPv4 Class A Network 10.x.x.x 6 | .*?//172\.(1[6-9]|2[0-9]|3[0-1])\..*+ # Private IPv4 Class B Network 172.16.0.0 .. 172.31.255.255 7 | .*?//192\.168\..*+ # Private IPv4 Class C Network 192.168.0.0 .. 192.168.255.255 8 | .*?//^::1.*+ # Localhost IPv6 9 | .*?//[fF][cCdD].*+ # IPv6 User Local Address Space 10 | -------------------------------------------------------------------------------- /bin/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | cd ../data 4 | KILLFILE="crawler-8300.kill" 5 | PIDFILE="crawler-8300.pid" 6 | 7 | # first method to terminate the process 8 | if [ -f "$KILLFILE" ]; 9 | then 10 | rm $KILLFILE 11 | echo "termination requested, waiting.." 12 | # this can take 10 seconds.. 13 | sleep 10 14 | fi 15 | 16 | # second method to terminate the process 17 | if [ -f "$PIDFILE" ]; 18 | then 19 | fuser -k $PIDFILE 20 | fi 21 | 22 | # check if file does not exist any more which would be a sign that this has terminated 23 | if [ ! 
-f "$PIDFILE" ]; 24 | then 25 | echo "process terminated" 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /.settings/org.eclipse.buildship.core.prefs: -------------------------------------------------------------------------------- 1 | arguments= 2 | auto.sync=false 3 | build.commands=org.eclipse.jdt.core.javabuilder 4 | build.scans.enabled=false 5 | connection.arguments= 6 | connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(5.6.2)) 7 | connection.java.home=null 8 | connection.jvm.arguments= 9 | connection.project.dir= 10 | derived.resources=.gradle,build 11 | eclipse.preferences.version=1 12 | gradle.user.home= 13 | java.home= 14 | jvm.arguments= 15 | natures=org.eclipse.jdt.core.javanature 16 | offline.mode=false 17 | override.workspace.settings=true 18 | project.path=\: 19 | show.console.view=true 20 | show.executions.view=true 21 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | yacy_grid_crawler 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.buildship.core.gradleprojectbuilder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.buildship.core.gradleprojectnature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ## yacy_grid_crawler dockerfile 2 | ## examples: 3 | # docker build -t yacy_grid_crawler . 4 | # docker run -d --rm -p 8300:8300 --name yacy_grid_crawler yacy_grid_crawler 5 | ## Check if the service is running: 6 | # curl http://localhost:8300/yacy/grid/mcp/info/status.json 7 | 8 | # build app 9 | FROM eclipse-temurin:8-jdk-focal AS appbuilder 10 | COPY ./ /app 11 | WORKDIR /app 12 | RUN ./gradlew clean shadowDistTar 13 | 14 | # build dist 15 | FROM eclipse-temurin:8-jre-focal 16 | LABEL maintainer="Michael Peter Christen " 17 | ENV DEBIAN_FRONTEND noninteractive 18 | ARG default_branch=master 19 | COPY ./conf /app/conf/ 20 | COPY --from=appbuilder /app/build/libs/ ./app/build/libs/ 21 | WORKDIR /app 22 | EXPOSE 8300 23 | 24 | # for some weird reason the jar file is sometimes not named correctly 25 | RUN if [ -e /app/build/libs/app-0.0.1-SNAPSHOT-all.jar ] ; then mv /app/build/libs/app-0.0.1-SNAPSHOT-all.jar /app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar; fi 26 | 27 | CMD ["java", "-Xms320M", "-Xmx2G", "-jar", "/app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar"] 28 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /bin/crawlstart.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import sys 4 | import requests 5 | import urllib.parse 6 | 7 | crawlingURL = sys.argv[1] 8 | parsed_url = urllib.parse.urlparse(crawlingURL) 9 | crawlingHost = parsed_url.netloc 10 | crawlingProtocol = parsed_url.scheme 11 | 12 | data = { 13 | 'cachePolicy': 'iffresh', 14 | 'collection': 'testcollection', 15 | 'crawlingstart': 'Start crawling', 16 | 'crawlingMode': 'url', 17 | 'crawlingQ': 
'on', 18 | 'crawlingDepth': 1, 19 | 'crawlingDepthExtension': '', 20 | 'crawlingURL': crawlingURL, 21 | 'deleteIfOlderNumber': 1, 22 | 'deleteIfOlderUnit': 'day', 23 | 'deleteold': 'age', 24 | 'indexmustmatch': '^{0}.*'.format(crawlingURL), 25 | 'indexmustnotmatch': '', 26 | 'indexMedia': 'on', 27 | 'mustmatch': '^{protocol}://{host}/.*'.format(protocol=crawlingProtocol, host=crawlingHost), 28 | 'mustnotmatch': '', 29 | 'indexText': 'on', 30 | 'range': 'wide', 31 | 'recrawl': 'reload', 32 | 'reloadIfOlderNumber': 0, 33 | 'reloadIfOlderUnit': 'day', 34 | 'storeHTCache': 'on', 35 | 'xsstopw': 'on', 36 | 'priority': 0 37 | } 38 | 39 | res = requests.get('http://localhost:8300/yacy/grid/crawler/crawlStart.json', params=data) 40 | 41 | if res.status_code != 200: 42 | print("ERR :: error starting the crawler") 43 | print(res.text) 44 | else: 45 | print("INF :: successfully sent '{0}' to crawler".format(crawlingURL)) 46 | -------------------------------------------------------------------------------- /bin/start_crawler_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "`dirname $0`" 3 | 4 | bindhost="127.0.0.1" 5 | callhost="localhost" 6 | appname="YaCy Grid Crawler" 7 | containername=yacy-grid-crawler 8 | imagename=${containername//-/_} 9 | dockerfile="Dockerfile" 10 | production=false 11 | open=false 12 | 13 | usage() { echo "usage: $0 [-o | --open | -p | --production | --arm32 | --arm64 ]" 1>&2; exit 1; } 14 | 15 | while [[ $# -gt 0 ]]; do 16 | case "$1" in 17 | -p | --production ) production=true; shift 1;; 18 | -o | --open ) open=true; shift 1;; 19 | --arm32 ) imagename=${imagename}:arm32; dockerfile=${dockerfile}_arm32; shift 1;; 20 | --arm64 ) imagename=${imagename}:arm64; dockerfile=${dockerfile}_arm64; shift 1;; 21 | -h | --help | -* | --* | * ) usage;; 22 | esac 23 | done 24 | if [ "$production" = true ] ; then imagename="yacy/${imagename}"; fi 25 | if [ "$open" = true ] ; then bindhost="0.0.0.0"; callhost=`hostname`; fi 26 | 27 | containerRuns=$(docker ps | grep -i "${containername}" | wc -l ) 28 | containerExists=$(docker ps -a | grep -i "${containername}" | wc -l ) 29 | if [ ${containerRuns} -gt 0 ]; then 30 | echo "${appname} container is already running" 31 | elif [ ${containerExists} -gt 0 ]; then 32 | docker start ${containername} 33 | echo "${appname} container re-started" 34 | else 35 | if [[ $imagename != "yacy/"*":latest" ]] && [[ "$(docker images -q ${imagename} 2> /dev/null)" == "" ]]; then 36 | cd .. 37 | docker build -t ${imagename} -f ${dockerfile} . 38 | cd bin 39 | fi 40 | docker run -d --restart=unless-stopped -p ${bindhost}:8300:8300 \ 41 | --link yacy-grid-minio --link yacy-grid-rabbitmq --link yacy-grid-elasticsearch --link yacy-grid-mcp \ 42 | -e YACYGRID_GRID_MCP_ADDRESS=yacy-grid-mcp \ 43 | --name ${containername} ${imagename} 44 | echo "${appname} started." 
45 | fi
46 | docker ps -a --format "table {{.ID}}\t{{.Image}}\t{{.Names}}\t{{.Mounts}}\t{{.Ports}}"
47 | 
48 | echo "To get the app status, open http://${callhost}:8300/yacy/grid/mcp/info/status.json"
49 | 
--------------------------------------------------------------------------------
/conf/config.properties:
--------------------------------------------------------------------------------
 1 | port = 8300
 2 | grid.mcp.address = 127.0.0.1:8100,node00.local:8100,brain.local:8100,searchlab.eu:8100
 3 | grid.broker.lazy = true
 4 | grid.broker.queue.limit = 0
 5 | grid.broker.queue.throttling = 100000
 6 | grid.assets.delete = true
 7 | 
 8 | # The blacklist is chosen with the attribute grid.crawler.blacklist which gives the file name(s) of the blacklist(s) to be used.
 9 | # To use your own blacklist, create a file in data/crawler-8300/conf/ and set the name of it
10 | # in the attribute grid.crawler.blacklist.
11 | #
12 | # You can use several blacklists simultaneously, just comma-separate the file names.
13 | # All files in the paths conf/ and data/crawler-8300/conf/ are found.
14 | # The same applies to files in parallel processes like data/crawler-8301/conf/ and so on.
15 | #
16 | # The file format of the blacklist is:
17 | # - it is a plain text file in UTF-8 encoding
18 | # - every line beginning with '#' is a comment and is ignored
19 | # - every string matching ' #.*' is removed. This cuts away comments from the end of a line.
20 | # - every blank line is ignored
21 | # - every other line must contain a regular expression according to
22 | #   https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
23 | #   which is considered as a matcher pattern (not a find pattern) for a URL.
24 | #   Lines containing a regular expression get their leading and trailing spaces removed.
25 | #
26 | # All regular expressions are considered to be a disjunction (OR logic) for the filtering of crawling URLs.
27 | # URLs are normalized before a matching is attempted, that means they are encoded properly
28 | # and the fragment identifier is removed from the end of the URL.
29 | grid.crawler.blacklist = crawler_blacklist_someonewhocares.txt,crawler_blacklist_localhost.txt
30 | grid.indexer.blacklist = indexer_blacklist_filetypes.txt
31 | grid.indexer.priorityQueues = 2
32 | 
33 | 
34 | 
35 | ####################################################################
36 | ## The following properties must be identical to those in the MCP ##
37 | ####################################################################
38 | 
39 | # The grid name is used to separate different grid networks.
40 | # Only networks with the same name connect with each other
41 | grid.name = freeworld
42 | 
43 | # Index names of the grid indexes:
44 | # crawlstart : a history of all crawl starts
45 | # crawler    : tracking of crawling progress
46 | # query      : a history of all queries
47 | # web        : the document search index ("web index", there)
48 | grid.elasticsearch.indexName.crawlstart = crawlstart
49 | grid.elasticsearch.indexName.crawler = crawler
50 | grid.elasticsearch.indexName.query = query
51 | grid.elasticsearch.indexName.web = web
52 | 
53 | # the following type name is an intermediate solution to migrate from elastic 6.x to 8.x
54 | # unfortunately the current index type name is 'web' but in future elastic versions the name '_doc'
55 | # is mandatory. We will use this setting until migration to elastic 8.x is complete and delete
56 | # the configuration afterwards.
57 | grid.elasticsearch.typeName = web -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/crawler/api/CrawlerDefaultValuesService.java: -------------------------------------------------------------------------------- 1 | /** 2 | * CrawlerDefaultValuesService 3 | * Copyright 04.6.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.crawler.api; 21 | 22 | import javax.servlet.http.HttpServletResponse; 23 | 24 | import org.json.JSONObject; 25 | 26 | import net.yacy.grid.contracts.User; 27 | import net.yacy.grid.http.APIHandler; 28 | import net.yacy.grid.http.ObjectAPIHandler; 29 | import net.yacy.grid.http.Query; 30 | import net.yacy.grid.http.ServiceResponse; 31 | 32 | /** 33 | * 34 | * Test URL: 35 | * http://localhost:8300/yacy/grid/crawler/defaultValues.json 36 | * 37 | * Test command: 38 | * curl http://localhost:8300/yacy/grid/crawler/defaultValues.json 39 | */ 40 | public class CrawlerDefaultValuesService extends ObjectAPIHandler implements APIHandler { 41 | 42 | private static final long serialVersionUID = 8578474303031749879L; 43 | public static final String NAME = "defaultValues"; 44 | 45 | public static JSONObject defaultValues = new JSONObject(true); 46 | static { 47 | defaultValues.put("crawlingMode", "url"); 48 | defaultValues.put("crawlingURL", ""); 49 | defaultValues.put("sitemapURL", ""); 50 | defaultValues.put("crawlingFile", ""); 51 | defaultValues.put("crawlingDepth", 3); 52 | defaultValues.put("crawlingDepthExtension", ""); 53 | defaultValues.put("range", "domain"); 54 | defaultValues.put("mustmatch", ".*"); 55 | defaultValues.put("mustnotmatch", ".*\\.(js|css|jpg|jpeg|png|dmg|mpg|mpeg|zip|gz|exe|pkg)"); 56 | defaultValues.put("ipMustmatch", ".*"); 57 | defaultValues.put("ipMustnotmatch", ""); 58 | defaultValues.put("indexmustmatch", ".*"); 59 | defaultValues.put("indexmustnotmatch", ""); 60 | defaultValues.put("deleteold", "off"); 61 | defaultValues.put("deleteIfOlderNumber", 0); 62 | defaultValues.put("deleteIfOlderUnit", "day"); 63 | defaultValues.put("recrawl", "nodoubles"); 64 | defaultValues.put("reloadIfOlderNumber", 0); 65 | defaultValues.put("reloadIfOlderUnit", "day"); 66 | defaultValues.put("crawlingDomMaxCheck", "off"); 67 | defaultValues.put("crawlingDomMaxPages", 1000); 68 | defaultValues.put("crawlingQ", "off"); 69 | defaultValues.put("cachePolicy", "if fresh"); 70 | defaultValues.put("collection", "user"); // corpus name 71 | defaultValues.put("agentName", ""); 72 | defaultValues.put("priority", 0); 73 | defaultValues.put("loaderHeadless", "false"); 74 | defaultValues.put("user_id", User.ANONYMOUS_ID); 75 | defaultValues.put("storeAssets", "false"); 76 | 
defaultValues.put("archiveWARC", "false");
 77 |         defaultValues.put("archiveIndex", "false");
 78 |         defaultValues.put("archiveGraph", "false");
 79 |     }
 80 | 
 81 |     @Override
 82 |     public String getAPIPath() {
 83 |         return "/yacy/grid/crawler/" + NAME + ".json";
 84 |     }
 85 | 
 86 |     public static JSONObject crawlStartDefaultClone() {
 87 |         final JSONObject json = new JSONObject(true);
 88 |         defaultValues.keySet().forEach(key -> json.put(key, defaultValues.get(key)));
 89 |         return json;
 90 |     }
 91 | 
 92 |     @Override
 93 |     public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) {
 94 |         return new ServiceResponse(defaultValues);
 95 |     }
 96 | 
 97 | }
 98 | 
 99 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # YaCy Grid Component: Crawler
 2 | 
 3 | The YaCy Grid is the second-generation implementation of YaCy, a peer-to-peer search engine.
 4 | A YaCy Grid installation consists of a set of micro-services which communicate with each other
 5 | using the MCP, see https://github.com/yacy/yacy_grid_mcp
 6 | 
 7 | ## Purpose
 8 | 
 9 | The Crawler is a microservice which can be deployed, e.g. using Docker. When the Crawler Component
10 | is started, it searches for an MCP and connects to it. By default the local host is searched for an
11 | MCP, but you can configure one yourself.
12 | 
13 | ## What it does
14 | 
15 | The Crawler then does the following:
16 | 
17 | ```
18 | while (a Crawl Contract is in the queue crawler_pending) do
19 |   - read the target url from the contract
20 |   - check against the search index if the url is registered in the transaction index as 'to-be-parsed'. If not, continue
21 |   - load the url content from the assets (it must have been loaded before! - that is another process)
22 |   - parse the content and create a YaCy JSON object with that content
23 |   - place the YaCy JSON within a contract in the index_pending queue
24 |   - extract all links from the YaCy JSON
25 |   - check the validity of the links using the crawl contract
26 |   - all remaining urls are checked against the transaction index, all existing urls are discarded
27 |   - write an index entry for the remaining urls with status 'to-be-loaded'
28 |   - and these remaining urls are placed onto the loader_pending queue
29 |   - the status of the target url is set to to-be-indexed
30 | od
31 | ```
32 | ## Required Infrastructure (Search Index, Asset Storage and Message Queues)
33 | 
34 | This requires a transaction index with the following information:
35 | * `URL` (as defined with https://tools.ietf.org/html/rfc3986)
36 | * `crawlid` (a hash)
37 | * status (`to-be-loaded`, `to-be-parsed`, `to-be-indexed`, `indexed`)
38 | As long as a crawl process is running, new urls (as discovered in the html source of a target url)
39 | must be written to the transaction index before the target url has a status change (from to-be-parsed to to-be-indexed).
40 | This makes it possible that the status of a crawl job and the fact that it has been terminated can be
41 | discovered from the transaction index.
42 | * if all status entries for a single `crawlid` are `indexed` then the crawl has been terminated.
43 | The Crawl process needs another database index, which contains the crawl description. The content must be almost the same as
44 | described in http://www.yacy-websuche.de/wiki/index.php/Dev:APICrawler
45 | 
46 | Every loader and parser microservice must read this crawl profile information. Because that information is required
47 | many times, we omit a request into the crawler index by adding the crawler profile into each contract of a crawl job in the
48 | crawler_pending and loader_pending queues.
49 | 
50 | The crawl is therefore controlled by those queues:
51 | * `loader_pending` queue: entries which the yacy_grid_loader process reads. This process loads given resources and writes them to the asset storage.
52 | * `crawler_pending` queue: entries which the yacy_grid_crawler process reads. This process loads the content from the asset storage, parses the content and creates new loader_pending tasks.
53 | 
54 | The required indexes are:
55 | * a crawl profile index
56 | * a transaction index which reflects the crawl status
57 | * a search index
58 | 
59 | The microservices will create these indexes on their own using the MCP component.
60 | 
61 | ## Installation: Download, Build, Run
62 | At this time, yacy_grid_crawler is not provided in compiled form, but you can easily build it yourself. It's not difficult and done in one minute! The source code is hosted at https://github.com/yacy/yacy_grid_crawler; you can download and run it with:
63 | 
64 | > git clone --recursive https://github.com/yacy/yacy_grid_crawler.git
65 | 
66 | If you just want to make an update, do the following:
67 | 
68 | > git pull origin master
69 | > git submodule foreach git pull origin master
70 | 
71 | To build and start the crawler, run
72 | 
73 | > cd yacy_grid_crawler
74 | > gradle run
75 | 
76 | Please read also https://github.com/yacy/yacy_grid_mcp/edit/master/README.md for further details.
77 | 
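A crawl can then be submitted over the REST API at `/yacy/grid/crawler/crawlStart.json`; the script `bin/crawlstart.py` in this repository does exactly that and shows the full parameter set. The following is a minimal sketch, assuming the crawler answers on the default port 8300 of localhost; every parameter that is left out falls back to the defaults served at `/yacy/grid/crawler/defaultValues.json`.

```python
#!/usr/bin/env python3
# Minimal crawl start sketch; assumes a crawler running on localhost:8300.
# Omitted parameters keep the service defaults (see defaultValues.json).
import requests

params = {
    'crawlingURL': 'https://yacy.net',  # start URL
    'crawlingDepth': 2,                 # the service caps this value at 8
    'priority': 0,
}
res = requests.get('http://localhost:8300/yacy/grid/crawler/crawlStart.json', params=params)
print(res.status_code, res.text)
```

The progress of the submitted crawl can be watched in the RabbitMQ web UI at http://localhost:15672/ (default account guest:guest), as noted in `CrawlStartService.java`.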
Because that information is required 47 | many times, we omit a request into the cawler index by adding the crawler profile into each contract of a crawl job in the 48 | crawler_pending and loader_pending queue. 49 | 50 | The crawl is therefore controlled by those queues: 51 | * `loader_pending` queue: entries which the yacy_grid_loader process reads. This process loads given resources and writes them to the asset storage. 52 | * `crawler_pending`queue: entries which the yacy_grid_crawler process reads. This process loads the content from the asset storage, parses the content and creates new loader_pending tasks. 53 | 54 | The required indexes are: 55 | * a crawl profile index 56 | * a transaction index which reflects the crawl status 57 | * a search index 58 | 59 | The microservices will create these indexes on their own using the MCP component. 60 | 61 | ## Installation: Download, Build, Run 62 | At this time, yacy_grid_crawler is not provided in compiled form, you easily build it yourself. It's not difficult and done in one minute! The source code is hosted at https://github.com/yacy/yacy_grid_crawler, you can download it and run loklak with: 63 | 64 | > git clone --recursive https://github.com/yacy/yacy_grid_crawler.git 65 | 66 | If you just want to make a update, do the following 67 | 68 | > git pull origin master 69 | > git submodule foreach git pull origin master 70 | 71 | To build and start the crawler, run 72 | 73 | > cd yacy_grid_crawler 74 | > gradle run 75 | 76 | Please read also https://github.com/yacy/yacy_grid_mcp/edit/master/README.md for further details. 77 | 78 | ## Contribute 79 | 80 | This is a community project and your contribution is welcome! 81 | 82 | 1. Check for [open issues](https://github.com/yacy/yacy_grid_crawler/issues) 83 | or open a fresh one to start a discussion around a feature idea or a bug. 84 | 2. Fork [the repository](https://github.com/yacy/yacy_grid_crawler.git) 85 | on GitHub to start making your changes (branch off of the master branch). 86 | 3. Write a test that shows the bug was fixed or the feature works as expected. 87 | 4. Send a pull request and bug us on Gitter until it gets merged and published. :) 88 | 89 | ## What is the software license? 90 | LGPL 2.1 91 | 92 | Have fun! 93 | 94 | @0rb1t3r 95 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/crawler/Blacklist.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Blacklist 3 | * Copyright 17.02.2018 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 
18 | */ 19 | 20 | 21 | package net.yacy.grid.crawler; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.nio.charset.StandardCharsets; 26 | import java.nio.file.Files; 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | import java.util.concurrent.atomic.AtomicInteger; 30 | import java.util.regex.Matcher; 31 | import java.util.regex.Pattern; 32 | import java.util.regex.PatternSyntaxException; 33 | 34 | import net.yacy.grid.tools.ARC; 35 | import net.yacy.grid.tools.HashARC; 36 | import net.yacy.grid.tools.Logger; 37 | import net.yacy.grid.tools.MultiProtocolURL; 38 | 39 | /** 40 | * A blacklist class to test if an URL is blacklisted. 41 | * This class has no object synchronization and it must not be used in concurrent environment. 42 | * The lack of concurrency is done on purpose. Each concurrent thread must initialize it's own blacklist. 43 | * This ensures that no concurrency issue appears between threads using the same blacklist. 44 | */ 45 | public class Blacklist { 46 | 47 | private final ARC blacklistHitCache; 48 | private final ARC blacklistMissCache; 49 | private final List blacklist; 50 | 51 | public Blacklist() { 52 | this.blacklist = new ArrayList<>(); 53 | this.blacklistHitCache = new HashARC<>(100000); 54 | this.blacklistMissCache = new HashARC<>(100000); 55 | } 56 | 57 | public void load(File f) throws IOException { 58 | final AtomicInteger counter = new AtomicInteger(0); 59 | Files.lines(f.toPath(), StandardCharsets.UTF_8).forEach(line -> { 60 | line = line.trim(); 61 | int p = line.indexOf(" #"); 62 | String info = ""; 63 | if (p >= 0) { 64 | info = line.substring(p + 1).trim(); 65 | line = line.substring(0, p); 66 | } 67 | line = line.trim(); 68 | if (!line.isEmpty() && !line.startsWith("#")) { 69 | if (line.startsWith("host ")) { 70 | String host = line.substring(5).trim(); 71 | try { 72 | BlacklistInfo bi = new BlacklistInfo(".*?//" + host + "/.*+", f.getName(), info, host); 73 | this.blacklist.add(bi); 74 | counter.incrementAndGet(); 75 | } catch (PatternSyntaxException e) { 76 | Logger.warn(this.getClass(), "regex for host in file " + f.getName() + " cannot be compiled: " + line.substring(5).trim()); 77 | } 78 | } else { 79 | try { 80 | BlacklistInfo bi = new BlacklistInfo(line, f.getName(), info, null); 81 | this.blacklist.add(bi); 82 | counter.incrementAndGet(); 83 | } catch (PatternSyntaxException e) { 84 | Logger.warn(this.getClass(), "regex for url in file " + f.getName() + " cannot be compiled: " + line); 85 | } 86 | } 87 | } 88 | }); 89 | Logger.info(this.getClass(), "loaded " + counter.get() + " blacklist entries from file " + f.getName()); 90 | } 91 | 92 | public final static class BlacklistInfo { 93 | public final Matcher matcher; 94 | public final String source; 95 | public final String info; 96 | public final String host; 97 | public BlacklistInfo(final String patternString, final String source, final String info, final String host) throws PatternSyntaxException { 98 | this.matcher = Pattern.compile(patternString).matcher(""); 99 | this.source = source; 100 | this.info = info; 101 | this.host = host; 102 | } 103 | } 104 | 105 | public BlacklistInfo isBlacklisted(String url, MultiProtocolURL u) { 106 | BlacklistInfo cachedBI = this.blacklistHitCache.get(url); 107 | if (cachedBI != null) return cachedBI; 108 | Boolean cachedMiss = this.blacklistMissCache.get(url); 109 | if (cachedMiss != null) return null; 110 | for (BlacklistInfo bi: this.blacklist) { 111 | if (u != null && bi.host != null) { 112 | if 
(u.getHost().equals(bi.host)) { 113 | return bi; 114 | } 115 | } else { 116 | bi.matcher.reset(url); 117 | //Thread.currentThread().setName(bi.matcher.pattern().pattern() + " -> " + url); 118 | if (bi.matcher.matches()) { 119 | this.blacklistHitCache.put(url, bi); 120 | return bi; 121 | } 122 | } 123 | } 124 | this.blacklistMissCache.put(url, Boolean.TRUE); 125 | return null; 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/crawler/Crawler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Crawler 3 | * Copyright 25.04.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.crawler; 21 | 22 | import java.util.ArrayList; 23 | import java.util.Arrays; 24 | import java.util.List; 25 | 26 | import javax.servlet.Servlet; 27 | 28 | import net.yacy.grid.YaCyServices; 29 | import net.yacy.grid.crawler.api.CrawlStartService; 30 | import net.yacy.grid.crawler.api.CrawlerDefaultValuesService; 31 | import net.yacy.grid.mcp.BrokerListener; 32 | import net.yacy.grid.mcp.Configuration; 33 | import net.yacy.grid.mcp.MCP; 34 | import net.yacy.grid.mcp.Service; 35 | import net.yacy.grid.tools.CronBox; 36 | import net.yacy.grid.tools.CronBox.Telemetry; 37 | import net.yacy.grid.tools.Logger; 38 | 39 | /** 40 | * The Crawler main class 41 | * 42 | * performance debugging: 43 | * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt 44 | * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt?count=100 45 | */ 46 | public class Crawler { 47 | 48 | private final static YaCyServices CRAWLER_SERVICE = YaCyServices.crawler; 49 | private final static String DATA_PATH = "data"; 50 | 51 | // define services 52 | @SuppressWarnings("unchecked") 53 | public final static Class[] CRAWLER_SERVICES = new Class[]{ 54 | CrawlerDefaultValuesService.class, 55 | CrawlStartService.class 56 | }; 57 | 58 | public static class Application implements CronBox.Application { 59 | 60 | final Configuration config; 61 | final Service service; 62 | final BrokerListener brokerApplication; 63 | final CronBox.Application serviceApplication; 64 | 65 | public Application() { 66 | Logger.info("Starting Crawler Application..."); 67 | 68 | // initialize configuration 69 | final List> services = new ArrayList<>(); 70 | services.addAll(Arrays.asList(MCP.MCP_SERVLETS)); 71 | services.addAll(Arrays.asList(CRAWLER_SERVICES)); 72 | this.config = new Configuration(DATA_PATH, true, CRAWLER_SERVICE, services.toArray(new Class[services.size()])); 73 | final int priorityQueues = Integer.parseInt(this.config.properties.get("grid.indexer.priorityQueues")); 74 | CrawlerListener.initPriorityQueue(priorityQueues); 75 | 76 | // initialize REST server with services 77 | this.service = new 
Service(this.config); 78 | 79 | // connect backend 80 | this.config.connectBackend(); 81 | 82 | // initiate broker application: listening to indexing requests at RabbitMQ 83 | this.brokerApplication = new CrawlerListener(this.config, CRAWLER_SERVICE); 84 | 85 | // initiate service application: listening to REST request 86 | this.serviceApplication = this.service.newServer(null); 87 | } 88 | 89 | @Override 90 | public void run() { 91 | 92 | Logger.info("Grid Name: " + this.config.properties.get("grid.name")); 93 | 94 | // starting threads 95 | new Thread(this.brokerApplication).start(); 96 | this.serviceApplication.run(); // SIC! the service application is running as the core element of this run() process. If we run it concurrently, this runnable will be "dead". 97 | } 98 | 99 | @Override 100 | public void stop() { 101 | Logger.info("Stopping Crawler Application..."); 102 | this.serviceApplication.stop(); 103 | this.brokerApplication.stop(); 104 | this.service.stop(); 105 | this.service.close(); 106 | this.config.close(); 107 | } 108 | 109 | @Override 110 | public Telemetry getTelemetry() { 111 | return null; 112 | } 113 | 114 | } 115 | 116 | public static void main(final String[] args) { 117 | // run in headless mode 118 | System.setProperty("java.awt.headless", "true"); // no awt used here so we can switch off that stuff 119 | 120 | // Debug Info 121 | boolean assertionenabled = false; 122 | assert (assertionenabled = true) == true; // compare to true to remove warning: "Possible accidental assignement" 123 | if (assertionenabled) Logger.info("Asserts are enabled"); 124 | 125 | // first greeting 126 | Logger.info("YaCy Grid Crawler started!"); 127 | 128 | // run application with cron 129 | final long cycleDelay = Long.parseLong(System.getProperty("YACYGRID_CRAWLER_CYCLEDELAY", "" + Long.MAX_VALUE)); // by default, run only in one genesis thread 130 | final int cycleRandom = Integer.parseInt(System.getProperty("YACYGRID_CRAWLER_CYCLERANDOM", "" + 1000 * 60 /*1 minute*/)); 131 | final CronBox cron = new CronBox(Application.class, cycleDelay, cycleRandom); 132 | cron.cycle(); 133 | 134 | // this line is reached if the cron process was shut down 135 | Logger.info("YaCy Grid Crawler terminated"); 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! 
-x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Use "xargs" to parse quoted args. 209 | # 210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
211 | # 212 | # In Bash we could simply go: 213 | # 214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 215 | # set -- "${ARGS[@]}" "$@" 216 | # 217 | # but POSIX shell has neither arrays nor command substitution, so instead we 218 | # post-process each arg (as a line of input to sed) to backslash-escape any 219 | # character that might be a shell metacharacter, then use eval to reverse 220 | # that process (while maintaining the separation between arguments), and wrap 221 | # the whole thing up as a single "set" statement. 222 | # 223 | # This will of course break if any of these variables contains a newline or 224 | # an unmatched quote. 225 | # 226 | 227 | eval "set -- $( 228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 229 | xargs -n1 | 230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 231 | tr '\n' ' ' 232 | )" '"$@"' 233 | 234 | exec "$JAVACMD" "$@" 235 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/crawler/api/CrawlStartService.java: -------------------------------------------------------------------------------- 1 | /** 2 | * CrawlStartService 3 | * Copyright 12.6.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 
18 | */ 19 | 20 | package net.yacy.grid.crawler.api; 21 | 22 | import java.io.IOException; 23 | import java.nio.charset.StandardCharsets; 24 | import java.util.Date; 25 | import java.util.Map; 26 | import java.util.regex.Pattern; 27 | 28 | import javax.servlet.http.HttpServletResponse; 29 | 30 | import org.json.JSONArray; 31 | import org.json.JSONObject; 32 | 33 | import ai.susi.mind.SusiAction; 34 | import ai.susi.mind.SusiThought; 35 | import net.yacy.grid.YaCyServices; 36 | import net.yacy.grid.contracts.User; 37 | import net.yacy.grid.crawler.CrawlerListener; 38 | import net.yacy.grid.http.APIHandler; 39 | import net.yacy.grid.http.ObjectAPIHandler; 40 | import net.yacy.grid.http.Query; 41 | import net.yacy.grid.http.ServiceResponse; 42 | import net.yacy.grid.io.index.CrawlerMapping; 43 | import net.yacy.grid.io.index.CrawlstartDocument; 44 | import net.yacy.grid.io.index.CrawlstartMapping; 45 | import net.yacy.grid.io.index.GridIndex; 46 | import net.yacy.grid.io.index.Index.QueryLanguage; 47 | import net.yacy.grid.io.index.WebMapping; 48 | import net.yacy.grid.io.messages.GridQueue; 49 | import net.yacy.grid.io.messages.ShardingMethod; 50 | import net.yacy.grid.mcp.Service; 51 | import net.yacy.grid.tools.Digest; 52 | import net.yacy.grid.tools.Domains; 53 | import net.yacy.grid.tools.JSONList; 54 | import net.yacy.grid.tools.Logger; 55 | import net.yacy.grid.tools.MultiProtocolURL; 56 | 57 | /** 58 | * 59 | * Test URL: 60 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=yacy.net&indexmustnotmatch=.*Mitmachen.*&mustmatch=.*yacy.net.* 61 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=ix.de&crawlingDepth=6&priority=true 62 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=tagesschau.de&loaderHeadless=false 63 | * 64 | * then check crawl queue status at http://localhost:15672/ 65 | * default account is guest:guest 66 | */ 67 | public class CrawlStartService extends ObjectAPIHandler implements APIHandler { 68 | 69 | private static final long serialVersionUID = 8578474303031749879L; 70 | public static final String NAME = "crawlStart"; 71 | 72 | @Override 73 | public String getAPIPath() { 74 | return "/yacy/grid/crawler/" + NAME + ".json"; 75 | } 76 | 77 | @Override 78 | public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) { 79 | final JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone(); 80 | 81 | // read call attributes using the default crawlstart key names 82 | for (final String key: crawlstart.keySet()) { 83 | final Object object = crawlstart.get(key); 84 | if (object instanceof String) crawlstart.put(key, call.get(key, crawlstart.getString(key))); 85 | else if (object instanceof Integer) crawlstart.put(key, call.get(key, crawlstart.getInt(key))); 86 | else if (object instanceof Long) crawlstart.put(key, call.get(key, crawlstart.getLong(key))); 87 | else if (object instanceof JSONArray) { 88 | final JSONArray a = crawlstart.getJSONArray(key); 89 | final Object cv = call.get(key); 90 | if (cv != null) crawlstart.put(key, cv); 91 | } else { 92 | System.out.println("unrecognized type: " + object.getClass().toString()); 93 | } 94 | } 95 | final String user_id = crawlstart.optString("user_id", User.ANONYMOUS_ID); 96 | 97 | // fix attributes 98 | final int crawlingDepth = crawlstart.optInt("crawlingDepth", 3); 99 | crawlstart.put("crawlingDepth", Math.min(crawlingDepth, 8)); // crawlingDepth shall not exceed 8 - this is used for enhanced balancing to be able to 
reach crawl leaves 100 | final String mustmatch = crawlstart.optString("mustmatch", CrawlerDefaultValuesService.defaultValues.getString("mustmatch")).trim(); 101 | crawlstart.put("mustmatch", mustmatch); 102 | final Map collections = WebMapping.collectionParser(crawlstart.optString("collection").trim()); 103 | 104 | // set the crawl id 105 | final CrawlerListener.CrawlstartURLSplitter crawlstartURLs = new CrawlerListener.CrawlstartURLSplitter(crawlstart.getString("crawlingURL")); 106 | final Date now = new Date(); 107 | // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts 108 | final SusiThought allCrawlstarts = new SusiThought(); 109 | int count = 0; 110 | for (final MultiProtocolURL url: crawlstartURLs.getURLs()) { 111 | final JSONObject singlecrawl = new JSONObject(); 112 | for (final String key: crawlstart.keySet()) singlecrawl.put(key, crawlstart.get(key)); // create a clone of crawlstart 113 | final String crawl_id = CrawlerListener.getCrawlID(url, now, count++); 114 | final String start_url = url.toNormalform(true); 115 | final String start_ssld = Domains.getSmartSLD(url.getHost()); 116 | singlecrawl.put("id", crawl_id); 117 | singlecrawl.put("user_id", user_id); 118 | singlecrawl.put("start_url", start_url); 119 | singlecrawl.put("start_ssld", start_ssld); 120 | 121 | //singlecrawl.put("crawlingURLs", new JSONArray().put(url.toNormalform(true))); 122 | 123 | try { 124 | // Create a crawlstart index entry: this will keep track of all crawls that have been started. 125 | // once such an entry is created, it is never changed or deleted again by any YaCy Grid process. 126 | final CrawlstartDocument crawlstartDoc = new CrawlstartDocument() 127 | .setCrawlID(crawl_id) 128 | .setUserID(user_id) 129 | .setMustmatch(mustmatch) 130 | .setCollections(collections.keySet()) 131 | .setCrawlstartURL(start_url) 132 | .setCrawlstartSSLD(start_ssld) 133 | .setInitDate(now) 134 | .setData(singlecrawl); 135 | crawlstartDoc.store(Service.instance.config, Service.instance.config.gridIndex); 136 | 137 | // Create a crawler url tracking index entry: this will keep track of single urls and their status 138 | // While it is processed. The entry also serves as a double-check entry to terminate a crawl even if the 139 | // crawler is restarted. 140 | 141 | // delete the start url 142 | final String url_id = Digest.encodeMD5Hex(start_url); 143 | final String crawlerIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER); 144 | final String crawlstartIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawlstart", GridIndex.DEFAULT_INDEXNAME_CRAWLSTART); 145 | long deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"_id\":\"" + url_id + "\"}"); 146 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for _id"); 147 | 148 | // Because 'old' crawls may block new ones we identify possible blocking entries using the mustmatch pattern. 149 | // We therefore delete all entries with the same mustmatch pattern before a crawl starts. 150 | if (mustmatch.equals(".*")) { 151 | // we cannot delete all wide crawl status urls! 
152 | final JSONList old_crawls = Service.instance.config.gridIndex.query(crawlstartIndexName, QueryLanguage.fields, "{ \"" + CrawlstartMapping.start_url_s.name() + "\":\"" + start_url + "\"}", 0, 100); 153 | // from there we pick out the crawl start id and delete using them 154 | for (final Object j: old_crawls.toArray()) { 155 | final String crawlid = ((JSONObject) j).optString(CrawlstartMapping.crawl_id_s.name()); 156 | if (crawlid.length() > 0) { 157 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.crawl_id_s.name() + "\":\"" + crawlid + "\"}"); 158 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for crawl_id_s"); 159 | } 160 | } 161 | // we also delete all entries with same start_url and start_ssld 162 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_url_s.name() + "\":\"" + start_url + "\"}"); 163 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_url_s"); 164 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_ssld_s.name() + "\":\"" + start_ssld + "\"}"); 165 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_ssld_s"); 166 | } else { 167 | // this should fit exactly on the old urls 168 | // test url: 169 | // curl -s -H 'Content-Type: application/json' -X GET http://localhost:9200/crawler/_search?q=_id:0a800a8ec1cc76b5eb8412ec494babc9 | python3 -m json.tool 170 | final String deletequery = "{ \"" + CrawlerMapping.mustmatch_s.name() + "\":\"" + mustmatch.replace("\\", "\\\\") + "\"}"; 171 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, deletequery); 172 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries"); 173 | } 174 | // we do not create a crawler document entry here because that would conflict with the double check. 175 | // crawler documents must be written after the double check has happened. 
176 | 177 | // create a crawl queue entry 178 | final GridQueue queueName = Service.instance.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.BALANCE, CrawlerListener.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost()); 179 | final SusiThought json = new SusiThought(); 180 | json.setData(new JSONArray().put(singlecrawl)); 181 | final JSONObject action = new JSONObject() 182 | .put("type", YaCyServices.crawler.name()) 183 | .put("queue", queueName.name()) 184 | .put("id", crawl_id) 185 | .put("user_id", user_id) 186 | .put("depth", 0) 187 | .put("sourcegraph", "rootasset"); 188 | final SusiAction crawlAction = new SusiAction(action); 189 | final JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), start_url); 190 | crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph)); 191 | json.addAction(crawlAction); 192 | allCrawlstarts.addAction(crawlAction); 193 | final byte[] b = json.toString().getBytes(StandardCharsets.UTF_8); 194 | Service.instance.config.gridBroker.send(YaCyServices.crawler, queueName, b); 195 | 196 | } catch (final IOException e) { 197 | Logger.warn(this.getClass(), "error when starting crawl for " + url.toNormalform(true), e); 198 | allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage()); 199 | } 200 | } 201 | 202 | // construct a crawl start message 203 | allCrawlstarts.setData(new JSONArray().put(crawlstart)); 204 | allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0); 205 | 206 | // finally add the crawl start on the queue 207 | return new ServiceResponse(allCrawlstarts); 208 | } 209 | 210 | } 211 | 212 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/crawler/CrawlerListener.java: -------------------------------------------------------------------------------- 1 | package net.yacy.grid.crawler; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.Serializable; 7 | import java.net.MalformedURLException; 8 | import java.nio.charset.StandardCharsets; 9 | import java.text.SimpleDateFormat; 10 | import java.util.ArrayList; 11 | import java.util.Collection; 12 | import java.util.Date; 13 | import java.util.HashMap; 14 | import java.util.HashSet; 15 | import java.util.Iterator; 16 | import java.util.List; 17 | import java.util.Locale; 18 | import java.util.Map; 19 | import java.util.Set; 20 | import java.util.concurrent.ConcurrentHashMap; 21 | import java.util.regex.Pattern; 22 | 23 | import org.json.JSONArray; 24 | import org.json.JSONObject; 25 | 26 | import ai.susi.mind.SusiAction; 27 | import ai.susi.mind.SusiThought; 28 | import net.yacy.grid.Services; 29 | import net.yacy.grid.YaCyServices; 30 | import net.yacy.grid.contracts.User; 31 | import net.yacy.grid.io.assets.Asset; 32 | import net.yacy.grid.io.index.CrawlerDocument; 33 | import net.yacy.grid.io.index.CrawlerDocument.Status; 34 | import net.yacy.grid.io.index.GridIndex; 35 | import net.yacy.grid.io.index.WebMapping; 36 | import net.yacy.grid.io.messages.GridQueue; 37 | import net.yacy.grid.io.messages.ShardingMethod; 38 | import net.yacy.grid.mcp.AbstractBrokerListener; 39 | import net.yacy.grid.mcp.BrokerListener; 40 | import net.yacy.grid.mcp.Configuration; 41 | import net.yacy.grid.tools.Classification.ContentDomain; 42 | import net.yacy.grid.tools.CronBox.Telemetry; 43 | import net.yacy.grid.tools.DateParser; 44 | 
import net.yacy.grid.tools.Digest; 45 | import net.yacy.grid.tools.JSONList; 46 | import net.yacy.grid.tools.Logger; 47 | import net.yacy.grid.tools.MultiProtocolURL; 48 | 49 | 50 | public class CrawlerListener extends AbstractBrokerListener implements BrokerListener { 51 | 52 | private final static String[] FIELDS_IN_GRAPH = new String[]{ 53 | WebMapping.inboundlinks_sxt.name(), 54 | WebMapping.outboundlinks_sxt.name(), 55 | //WebMapping.images_sxt.name(), 56 | WebMapping.frames_sxt.name(), 57 | WebMapping.iframes_sxt.name() 58 | }; 59 | 60 | private final static String PATTERN_TIMEF = "YYYYMMddHHmmssSSS"; 61 | 62 | public static int[] CRAWLER_PRIORITY_DIMENSIONS = YaCyServices.crawler.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.crawler.getSourceQueues().length - 1, 1}; 63 | private static int[] LOADER_PRIORITY_DIMENSIONS = YaCyServices.loader.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.loader.getSourceQueues().length - 1, 1}; 64 | private static int[] PARSER_PRIORITY_DIMENSIONS = YaCyServices.parser.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.parser.getSourceQueues().length - 1, 1}; 65 | private static int[] INDEXER_PRIORITY_DIMENSIONS = YaCyServices.indexer.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.indexer.getSourceQueues().length - 1, 1}; 66 | 67 | static void initPriorityQueue(final int priorityDimension) { 68 | CRAWLER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.crawler, priorityDimension); 69 | LOADER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.loader, priorityDimension); 70 | PARSER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.parser, priorityDimension); 71 | INDEXER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.indexer, priorityDimension); 72 | } 73 | 74 | private static int[] priorityDimensions(final YaCyServices service, final int d) { 75 | return service.getSourceQueues().length <= d ? 
new int[] {service.getSourceQueues().length, 0} : new int[] {service.getSourceQueues().length - d, d}; 76 | } 77 | 78 | private final String[] blacklist_crawler_names_list, blacklist_indexer_names_list; 79 | private final Map blacklists_crawler, blacklists_indexer; 80 | 81 | //private final static Map doubles = Service.hazelcast.getMap("doubles"); 82 | private final Map doubles = new ConcurrentHashMap<>(); 83 | private static long doublesLastCleanup = System.currentTimeMillis(); 84 | private final static long doublesCleanupTimeout = 1000L * 60L * 60L * 24L * 7L; // cleanup after 7 days 85 | private final static long doublesCleanupPeriod = 1000L * 60L * 10L; // do cleanup each 10 minutes 86 | private static class DoubleCache implements Serializable { 87 | private static final long serialVersionUID = 614262945713636851L; 88 | public Set doubleHashes; 89 | public long time; 90 | public DoubleCache() { 91 | this.time = System.currentTimeMillis(); 92 | this.doubleHashes = ConcurrentHashMap.newKeySet(); 93 | } 94 | } 95 | 96 | private void doDoubleCleanup() { 97 | final long now = System.currentTimeMillis(); 98 | if (now - doublesLastCleanup < doublesCleanupPeriod) return; 99 | doublesLastCleanup = now; 100 | final Iterator> i = this.doubles.entrySet().iterator(); 101 | while (i.hasNext()) { 102 | final Map.Entry cache = i.next(); 103 | if ((now - cache.getValue().time) > doublesCleanupTimeout) { 104 | cache.getValue().doubleHashes.clear(); 105 | i.remove(); 106 | } 107 | } 108 | } 109 | 110 | public static class CrawlstartURLSplitter { 111 | 112 | private final List crawlingURLArray; 113 | private final List badURLStrings; 114 | 115 | public CrawlstartURLSplitter(String crawlingURLsString) { 116 | Logger.info(this.getClass(), "splitting url list: " + crawlingURLsString); 117 | crawlingURLsString = crawlingURLsString.replaceAll("\\|http", "\nhttp").replaceAll("%7Chttp", "\nhttp").replaceAll("%0D%0A", "\n").replaceAll("%0A", "\n").replaceAll("%0D", "\n").replaceAll(" ", "\n"); 118 | final String[] crawlingURLs = crawlingURLsString.split("\n"); 119 | this.crawlingURLArray = new ArrayList<>(); 120 | this.badURLStrings = new ArrayList<>(); 121 | for (final String u: crawlingURLs) { 122 | try { 123 | final MultiProtocolURL url = new MultiProtocolURL(u); 124 | Logger.info(this.getClass(), "splitted url: " + url.toNormalform(true)); 125 | this.crawlingURLArray.add(url); 126 | } catch (final MalformedURLException e) { 127 | this.badURLStrings.add(u); 128 | Logger.warn(this.getClass(), "error when starting crawl with splitter url " + u + "; splitted from " + crawlingURLsString, e); 129 | } 130 | } 131 | } 132 | 133 | public List getURLs() { 134 | return this.crawlingURLArray; 135 | } 136 | 137 | public List getBadURLs() { 138 | return this.badURLStrings; 139 | } 140 | } 141 | 142 | public static String getCrawlID(final MultiProtocolURL url, final Date date, final int count) { 143 | String id = url.getHost(); 144 | if (id.length() > 80) id = id.substring(0, 80) + "-" + id.hashCode(); 145 | id = id + "-" + DateParser.secondDateFormat.format(date).replace(':', '-').replace(' ', '-') + "-" + count; 146 | return id; 147 | } 148 | 149 | public CrawlerListener(final Configuration config, final YaCyServices service) { 150 | super(config, service, Runtime.getRuntime().availableProcessors()); 151 | 152 | this.blacklist_crawler_names_list = config.properties.get("grid.crawler.blacklist").split(","); 153 | this.blacklist_indexer_names_list = config.properties.get("grid.indexer.blacklist").split(","); 154 | 
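        // "grid.crawler.blacklist" and "grid.indexer.blacklist" hold comma-separated blacklist file names which
        // loadBlacklist() below resolves against the service's conf/ directory, e.g. (illustrative values):
        //   grid.crawler.blacklist=crawler_blacklist_localhost.txt
        //   grid.indexer.blacklist=indexer_blacklist_filetypes.txt
        // The two maps below cache one Blacklist instance per worker key (processName + "_" + processNumber),
        // loaded lazily on first use in getBlacklistCrawler() / getBlacklistIndexer().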
this.blacklists_crawler = new ConcurrentHashMap<>(); 155 | this.blacklists_indexer = new ConcurrentHashMap<>(); 156 | } 157 | 158 | private final Blacklist getBlacklistCrawler(final String processName, final int processNumber) { 159 | final String key = processName + "_" + processNumber; 160 | Blacklist blacklist = this.blacklists_crawler.get(key); 161 | if (blacklist == null) { 162 | this.blacklists_crawler.put(key, blacklist = loadBlacklist(this.blacklist_crawler_names_list)); 163 | } 164 | return blacklist; 165 | } 166 | 167 | private final Blacklist getBlacklistIndexer(final String processName, final int processNumber) { 168 | final String key = processName + "_" + processNumber; 169 | Blacklist blacklist = this.blacklists_indexer.get(key); 170 | if (blacklist == null) { 171 | this.blacklists_indexer.put(key, blacklist = loadBlacklist(this.blacklist_indexer_names_list)); 172 | } 173 | return blacklist; 174 | } 175 | 176 | private final Blacklist loadBlacklist(final String[] names) { 177 | final Blacklist blacklist = new Blacklist(); 178 | for (final String name: names) { 179 | File f = new File(super.config.gridServicePath, "conf/" + name.trim()); 180 | if (!f.exists()) f = new File("conf/" + name.trim()); 181 | if (!f.exists()) continue; 182 | try { 183 | blacklist.load(f); 184 | } catch (final IOException e) { 185 | Logger.warn(this.getClass(), e); 186 | } 187 | } 188 | return blacklist; 189 | } 190 | 191 | @Override 192 | public ActionResult processAction(final SusiAction crawlaction, final JSONArray data, final String processName, final int processNumber) { 193 | doDoubleCleanup(); 194 | final String crawl_id = crawlaction.getStringAttr("id"); 195 | String user_id = crawlaction.getStringAttr("user_id"); 196 | if (user_id == null || user_id.length() == 0) user_id = User.ANONYMOUS_ID; 197 | JSONArray user_ids = crawlaction.getArrayAttr("user_ids"); 198 | if (user_ids == null) user_ids = new JSONArray(); 199 | if (user_id != null && user_id.length() > 0 && !user_ids.toList().contains(user_id)) user_ids.put(user_id); 200 | 201 | if (crawl_id == null || crawl_id.length() == 0) { 202 | Logger.info("Crawler.processAction Fail: Action does not have an id: " + crawlaction.toString()); 203 | return ActionResult.FAIL_IRREVERSIBLE; 204 | } 205 | final JSONObject crawl = SusiThought.selectData(data, "id", crawl_id); 206 | if (crawl == null) { 207 | Logger.info(this.getClass(), "Crawler.processAction Fail: ID of Action not found in data: " + crawlaction.toString()); 208 | return ActionResult.FAIL_IRREVERSIBLE; 209 | } 210 | 211 | final boolean archiveWARC = crawl.optBoolean("archiveWARC"); 212 | final boolean archiveIndex = crawl.optBoolean("archiveIndex"); 213 | final boolean archiveGraph = crawl.optBoolean("archiveGraph"); 214 | 215 | final int depth = crawlaction.getIntAttr("depth"); 216 | final int crawlingDepth = crawl.getInt("crawlingDepth"); 217 | final int priority = crawl.has("priority") ? 
crawl.getInt("priority") : 0; 218 | // check depth (this check should be deprecated because we limit by omitting the crawl message at crawl tree leaves) 219 | if (depth > crawlingDepth) { 220 | // this is a leaf in the crawl tree (it does not mean that the crawl is finished) 221 | Logger.info(this.getClass(), "Crawler.processAction Leaf: reached a crawl leaf for crawl " + crawl_id + ", depth = " + crawlingDepth); 222 | return ActionResult.SUCCESS; 223 | } 224 | final boolean isCrawlLeaf = depth == crawlingDepth; 225 | 226 | // load graph 227 | final String sourcegraph = crawlaction.getStringAttr("sourcegraph"); 228 | if (sourcegraph == null || sourcegraph.length() == 0) { 229 | Logger.info(this.getClass(), "Crawler.processAction Fail: sourcegraph of Action is empty: " + crawlaction.toString()); 230 | return ActionResult.FAIL_IRREVERSIBLE; 231 | } 232 | try { 233 | JSONList jsonlist = null; 234 | if (crawlaction.hasAsset(sourcegraph)) { 235 | jsonlist = crawlaction.getJSONListAsset(sourcegraph); 236 | } 237 | if (jsonlist == null) try { 238 | final Asset graphasset = super.config.gridStorage.load(sourcegraph); // this must be a list of json, containing document links 239 | final byte[] graphassetbytes = graphasset.getPayload(); 240 | jsonlist = new JSONList(new ByteArrayInputStream(graphassetbytes)); 241 | } catch (final IOException e) { 242 | Logger.warn(this.getClass(), "Crawler.processAction could not read asset from storage: " + sourcegraph, e); 243 | return ActionResult.FAIL_IRREVERSIBLE; 244 | } 245 | 246 | // declare filter from the crawl profile 247 | final String mustmatchs = crawl.optString("mustmatch"); 248 | final Pattern mustmatch = Pattern.compile(mustmatchs); 249 | final String mustnotmatchs = crawl.optString("mustnotmatch"); 250 | final Pattern mustnotmatch = Pattern.compile(mustnotmatchs); 251 | // filter for indexing steering 252 | final String indexmustmatchs = crawl.optString("indexmustmatch"); 253 | final Pattern indexmustmatch = Pattern.compile(indexmustmatchs); 254 | final String indexmustnotmatchs = crawl.optString("indexmustnotmatch"); 255 | final Pattern indexmustnotmatch = Pattern.compile(indexmustnotmatchs); 256 | // attributes for new crawl entries 257 | final String collectionss = crawl.optString("collection"); 258 | final Map collections = WebMapping.collectionParser(collectionss); 259 | final String start_url = crawl.optString("start_url"); 260 | final String start_ssld = crawl.optString("start_ssld"); 261 | 262 | final Date now = new Date(); 263 | final long timestamp = now.getTime(); 264 | // For each of the parsed document, there is a target graph. 265 | // The graph contains all url elements which may appear in a document. 266 | // In the following loop we collect all urls which may be of interest for the next depth of the crawl. 267 | final Map nextMap = new HashMap<>(); // a map from urlid to url 268 | final Blacklist blacklist_crawler = getBlacklistCrawler(processName, processNumber); 269 | final List crawlerDocuments = new ArrayList<>(); 270 | graphloop: for (int line = 0; line < jsonlist.length(); line++) { 271 | final JSONObject json = jsonlist.get(line); 272 | if (json.has("index")) continue graphloop; // this is an elasticsearch index directive, we just skip that 273 | 274 | final String sourceurl = json.has(WebMapping.url_s.getMapping().name()) ? json.getString(WebMapping.url_s.getMapping().name()) : ""; 275 | final Set graph = new HashSet<>(); 276 | final String graphurl = json.has(WebMapping.canonical_s.name()) ? 
json.getString(WebMapping.canonical_s.name()) : null; 277 | if (graphurl != null) try { 278 | graph.add(new MultiProtocolURL(graphurl)); 279 | } catch (final MalformedURLException e) { 280 | Logger.warn(this.getClass(), "Crawler.processAction error when starting crawl with canonical url " + graphurl, e); 281 | } 282 | for (final String field: FIELDS_IN_GRAPH) { 283 | if (json.has(field)) { 284 | final JSONArray a = json.getJSONArray(field); 285 | urlloop: for (int i = 0; i < a.length(); i++) { 286 | final String u = a.getString(i); 287 | try { 288 | graph.add(new MultiProtocolURL(u)); 289 | } catch (final MalformedURLException e) { 290 | Logger.warn(this.getClass(), "Crawler.processAction we discovered a bad follow-up url: " + u, e); 291 | continue urlloop; 292 | } 293 | } 294 | } 295 | } 296 | 297 | // sort out doubles and apply filters 298 | DoubleCache doublecache = null; 299 | if (!this.doubles.containsKey(crawl_id)) this.doubles.put(crawl_id, new DoubleCache()); 300 | doublecache = this.doubles.get(crawl_id); 301 | Logger.info(this.getClass(), "Crawler.processAction processing sub-graph with " + graph.size() + " urls for url " + sourceurl); 302 | urlcheck: for (final MultiProtocolURL url: graph) { 303 | // prepare status document 304 | final ContentDomain cd = url.getContentDomainFromExt(); 305 | 306 | if (cd == ContentDomain.TEXT || cd == ContentDomain.ALL) { 307 | // check if the url shall be loaded using the constraints 308 | final String u = url.toNormalform(true); 309 | final String urlid = Digest.encodeMD5Hex(u); 310 | 311 | // double check with the fast double cache 312 | if (doublecache.doubleHashes.contains(urlid)) { 313 | continue urlcheck; 314 | } 315 | doublecache.doubleHashes.add(urlid); 316 | 317 | // create new crawl status document 318 | final CrawlerDocument crawlStatus = new CrawlerDocument() 319 | .setCrawlID(crawl_id) 320 | .setUserlID(user_id) 321 | .setMustmatch(mustmatchs) 322 | .setCollections(collections.keySet()) 323 | .setCrawlstartURL(start_url) 324 | .setCrawlstartSSLD(start_ssld) 325 | .setInitDate(now) 326 | .setStatusDate(now) 327 | .setURL(u); 328 | 329 | // check matcher rules 330 | if (!mustmatch.matcher(u).matches() || mustnotmatch.matcher(u).matches()) { 331 | crawlStatus 332 | .setStatus(Status.rejected) 333 | .setComment(!mustmatch.matcher(u).matches() ? 
"url does not match must-match filter " + mustmatchs : "url matches mustnotmatch filter " + mustnotmatchs); 334 | crawlerDocuments.add(crawlStatus); 335 | continue urlcheck; 336 | } 337 | 338 | // check blacklist (this is costly because the blacklist is huge) 339 | final Blacklist.BlacklistInfo blacklistInfo = blacklist_crawler.isBlacklisted(u, url); 340 | if (blacklistInfo != null) { 341 | Logger.info(this.getClass(), "Crawler.processAction crawler blacklist pattern '" + blacklistInfo.matcher.pattern().toString() + "' removed url '" + u + "' from crawl list " + blacklistInfo.source + ": " + blacklistInfo.info); 342 | crawlStatus 343 | .setStatus(Status.rejected) 344 | .setComment("url matches blacklist"); 345 | crawlerDocuments.add(crawlStatus); 346 | continue urlcheck; 347 | } 348 | 349 | // double check with the elastic index (we do this late here because it is the most costly operation) 350 | //if (config.gridIndex.exist(GridIndex.CRAWLER_INDEX_NAME, GridIndex.EVENT_TYPE_NAME, urlid)) { 351 | // continue urlcheck; 352 | //} 353 | 354 | // add url to next stack 355 | nextMap.put(urlid, u); 356 | } 357 | }; 358 | } 359 | 360 | if (!nextMap.isEmpty()) { 361 | 362 | // make a double-check 363 | final String crawlerIndexName = super.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER); 364 | final Set exist = super.config.gridIndex.existBulk(crawlerIndexName, nextMap.keySet()); 365 | for (final String u: exist) nextMap.remove(u); 366 | final Collection nextList = nextMap.values(); // a set of urls 367 | 368 | // divide the nextList into two sub-lists, one which will reach the indexer and another one which will not cause indexing 369 | @SuppressWarnings("unchecked") 370 | final 371 | List[] indexNoIndex = new List[2]; 372 | indexNoIndex[0] = new ArrayList<>(); // for: index 373 | indexNoIndex[1] = new ArrayList<>(); // for: no-Index 374 | final Blacklist blacklist_indexer = getBlacklistIndexer(processName, processNumber); 375 | nextList.forEach(url -> { 376 | final boolean indexConstratntFromCrawlProfil = indexmustmatch.matcher(url).matches() && !indexmustnotmatch.matcher(url).matches(); 377 | final Blacklist.BlacklistInfo blacklistInfo = blacklist_indexer.isBlacklisted(url, null); 378 | final boolean indexConstraintFromBlacklist = blacklistInfo == null; 379 | if (indexConstratntFromCrawlProfil && indexConstraintFromBlacklist) { 380 | indexNoIndex[0].add(url); 381 | } else { 382 | indexNoIndex[1].add(url); 383 | } 384 | }); 385 | 386 | for (int ini = 0; ini < 2; ini++) { 387 | 388 | // create crawler index entries 389 | for (final String u: indexNoIndex[ini]) { 390 | final CrawlerDocument crawlStatus = new CrawlerDocument() 391 | .setCrawlID(crawl_id) 392 | .setUserlID(user_id) 393 | .setMustmatch(mustmatchs) 394 | .setCollections(collections.keySet()) 395 | .setCrawlstartURL(start_url) 396 | .setCrawlstartSSLD(start_ssld) 397 | .setInitDate(now) 398 | .setStatusDate(now) 399 | .setStatus(Status.accepted) 400 | .setURL(u) 401 | .setComment(ini == 0 ? 
"to be indexed" : "noindex, just for crawling"); 402 | crawlerDocuments.add(crawlStatus); 403 | } 404 | 405 | // create partitions 406 | final List partitions = createPartition(indexNoIndex[ini], 8); 407 | 408 | // create follow-up crawl to next depth 409 | for (int pc = 0; pc < partitions.size(); pc++) { 410 | final JSONObject loaderAction = newLoaderAction( 411 | priority, crawl_id, user_id, user_ids, partitions.get(pc), depth, isCrawlLeaf, 412 | 0, timestamp + ini, pc, depth < crawlingDepth, ini == 0, 413 | archiveWARC, archiveIndex, archiveGraph); // action includes whole hierarchy of follow-up actions 414 | final SusiThought nextjson = new SusiThought() 415 | .setData(data) 416 | .addAction(new SusiAction(loaderAction)); 417 | 418 | // put a loader message on the queue 419 | final String message = nextjson.toString(2); 420 | final byte[] b = message.getBytes(StandardCharsets.UTF_8); 421 | try { 422 | final Services serviceName = YaCyServices.valueOf(loaderAction.getString("type")); 423 | final GridQueue queueName = new GridQueue(loaderAction.getString("queue")); 424 | super.config.gridBroker.send(serviceName, queueName, b); 425 | } catch (final IOException e) { 426 | Logger.warn(this.getClass(), "error when starting crawl with message " + message, e); 427 | } 428 | }; 429 | } 430 | } 431 | // bulk-store the crawler documents 432 | final Map crawlerDocumentsMap = new HashMap<>(); 433 | crawlerDocuments.forEach(crawlerDocument -> { 434 | final String url = crawlerDocument.getURL(); 435 | if (url != null && url.length() > 0) { 436 | final String id = Digest.encodeMD5Hex(url); 437 | crawlerDocumentsMap.put(id, crawlerDocument); 438 | } else { 439 | assert false : "url not set / storeBulk"; 440 | } 441 | }); 442 | CrawlerDocument.storeBulk(super.config, super.config.gridIndex, crawlerDocumentsMap); 443 | Logger.info(this.getClass(), "Crawler.processAction processed graph with " + jsonlist.length()/2 + " subgraphs from " + sourcegraph); 444 | return ActionResult.SUCCESS; 445 | } catch (final Throwable e) { 446 | Logger.warn(this.getClass(), "Crawler.processAction Fail: loading of sourcegraph failed: " + e.getMessage() /*+ "\n" + crawlaction.toString()*/, e); 447 | return ActionResult.FAIL_IRREVERSIBLE; 448 | } 449 | } 450 | 451 | private static List createPartition(final Collection urls, final int partitionSize) { 452 | final List partitions = new ArrayList<>(); 453 | urls.forEach(url -> { 454 | int c = partitions.size(); 455 | if (c == 0 || partitions.get(c - 1).length() >= partitionSize) { 456 | partitions.add(new JSONArray()); 457 | c++; 458 | } 459 | partitions.get(c - 1).put(url); 460 | }); 461 | return partitions; 462 | } 463 | 464 | /** 465 | * Create a new loader action. This action contains all follow-up actions after 466 | * loading to create a steering of parser, indexing and follow-up crawler actions. 467 | * @param priority the prioroty of the crawl 468 | * @param id the crawl id 469 | * @param user_id the id of the user (9 digit number) 470 | * @param user_ids all users which have that domin as crawl assigned 471 | * @param urls the urls which are part of the same actions 472 | * @param depth the depth of the crawl step (0 is start depth) 473 | * @param retry the number of load re-tries (0 is no retry, shows that this is the first attempt) 474 | * @param timestamp the current time when the crawler created the action 475 | * @param partition unique number of the url set partition. This is used to create asset names. 
476 | * @param doCrawling flag: if true, create a follow-up crawling action. set this to false to terminate crawling afterwards 477 | * @param doIndexing flag: if true, do an indexing after loading. set this to false if the purpose is only a follow-up crawl after parsing 478 | * @return the action json 479 | * @throws IOException 480 | */ 481 | private JSONObject newLoaderAction( 482 | final int priority, 483 | final String id, 484 | final String user_id, 485 | final JSONArray user_ids, 486 | final JSONArray urls, 487 | final int depth, 488 | final boolean isCrawlLeaf, 489 | final int retry, 490 | final long timestamp, 491 | final int partition, 492 | final boolean doCrawling, 493 | final boolean doIndexing, 494 | final boolean archiveWARC, 495 | final boolean archiveIndex, 496 | final boolean archiveGraph) throws IOException { 497 | // create file names for the assets: this uses depth and partition information 498 | final SimpleDateFormat FORMAT_TIMEF = new SimpleDateFormat(PATTERN_TIMEF, Locale.US); // we must create this here to prevent concurrency bugs which are there in the date formatter :(( 499 | final String basepath = "/data/aaaaa/accounting/" + user_id + "/"; 500 | final String docname = "d" + intf(depth, 2) + "-t" + FORMAT_TIMEF.format(new Date(timestamp)) + "-p" + intf(partition, 4); 501 | final String warcasset = basepath + "warc/" + id + "/" + docname + ".warc.gz"; 502 | final String indexasset = basepath + "index/" + id + "/" + docname + ".index.jsonlist"; 503 | final String graphasset = basepath + "graph/" + id + "/" + docname + ".graph.jsonlist"; 504 | final String hashKey = new MultiProtocolURL(urls.getString(0)).getHost(); 505 | 506 | // create actions to be done in reverse order: 507 | // at the end of the processing we simultaneously place actions on the indexing and crawling queue 508 | final JSONArray postParserActions = new JSONArray(); 509 | assert doIndexing || doCrawling; // one or both must be true; doing none of that does not make sense 510 | // if all of the urls shall be indexed (see indexing patterns) then do indexing actions 511 | if (doIndexing) { 512 | final GridQueue indexerQueueName = super.config.gridBroker.queueName(YaCyServices.indexer, YaCyServices.indexer.getSourceQueues(), ShardingMethod.LEAST_FILLED, INDEXER_PRIORITY_DIMENSIONS, priority, hashKey); 513 | postParserActions.put(new JSONObject(true) 514 | .put("type", YaCyServices.indexer.name()) 515 | .put("queue", indexerQueueName.name()) 516 | .put("id", id) 517 | .put("user_id", user_id) 518 | .put("user_ids", user_ids) 519 | .put("sourceasset", indexasset) 520 | .put("archiveindex", archiveIndex) 521 | ); 522 | } 523 | // if all of the urls shall be crawled at depth + 1, add a crawling action. Don't do this only if the crawling depth is at the depth limit. 
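        // The finished message is a chain of nested actions processed by the grid services in the order
        // loader -> parser -> (indexer, crawler at depth + 1). Abbreviated illustration of the JSON this
        // method returns (asset paths shortened, values are examples only):
        //   { "type": "loader", "queue": "...", "id": "...", "urls": [...], "targetasset": "<warc>",
        //     "actions": [ { "type": "parser", "sourceasset": "<warc>", "targetasset": "<index>", "targetgraph": "<graph>",
        //                    "actions": [ { "type": "indexer", "sourceasset": "<index>", ... },
        //                                 { "type": "crawler", "depth": <depth+1>, "sourcegraph": "<graph>", ... } ] } ] }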
524 | if (doCrawling) { 525 | final GridQueue crawlerQueueName = super.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.LEAST_FILLED, CRAWLER_PRIORITY_DIMENSIONS, priority, hashKey); 526 | postParserActions.put(new JSONObject(true) 527 | .put("type", YaCyServices.crawler.name()) 528 | .put("queue", crawlerQueueName.name()) 529 | .put("id", id) 530 | .put("user_id", user_id) 531 | .put("user_ids", user_ids) 532 | .put("depth", depth + 1) 533 | .put("sourcegraph", graphasset) 534 | .put("archivegraph", archiveGraph) 535 | ); 536 | } 537 | 538 | // before that and after loading we have a parsing action 539 | final GridQueue parserQueueName = super.config.gridBroker.queueName(YaCyServices.parser, YaCyServices.parser.getSourceQueues(), ShardingMethod.LEAST_FILLED, PARSER_PRIORITY_DIMENSIONS, priority, hashKey); 540 | final JSONArray parserActions = new JSONArray().put(new JSONObject(true) 541 | .put("type", YaCyServices.parser.name()) 542 | .put("queue", parserQueueName.name()) 543 | .put("id", id) 544 | .put("user_id", user_id) 545 | .put("user_ids", user_ids) 546 | .put("sourceasset", warcasset) 547 | .put("targetasset", indexasset) 548 | .put("targetgraph", graphasset) 549 | .put("archivewarc", archiveWARC) 550 | .put("archiveindex", archiveIndex) 551 | .put("archivegraph", archiveGraph) 552 | .put("actions", postParserActions)); // actions after parsing 553 | 554 | // at the beginning of the process, we do a loading. 555 | final GridQueue loaderQueueName = super.config.gridBroker.queueName(YaCyServices.loader, YaCyServices.loader.getSourceQueues(), isCrawlLeaf ? ShardingMethod.LEAST_FILLED : ShardingMethod.BALANCE, LOADER_PRIORITY_DIMENSIONS, priority, hashKey); 556 | final JSONObject loaderAction = new JSONObject(true) 557 | .put("type", YaCyServices.loader.name()) 558 | .put("queue", loaderQueueName.name()) 559 | .put("id", id) 560 | .put("user_id", user_id) 561 | .put("user_ids", user_ids) 562 | .put("urls", urls) 563 | .put("targetasset", warcasset) 564 | .put("archivewarc", archiveWARC) 565 | .put("actions", parserActions); // actions after loading 566 | return loaderAction; 567 | } 568 | 569 | private final static String intf(final int i, final int len) { 570 | String s = Integer.toString(i); 571 | while (s.length() < len) s = '0' + s; 572 | return s; 573 | } 574 | 575 | @Override 576 | public Telemetry getTelemetry() { 577 | return null; 578 | } 579 | } 580 | --------------------------------------------------------------------------------
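For reference, the URL partitioning and zero-padded asset naming used by CrawlerListener above can be reproduced with a small standalone sketch. The class and method names here (PartitionSketch, partition, zeroPad) are hypothetical; the real logic lives in CrawlerListener.createPartition and CrawlerListener.intf, which fill JSONArray objects rather than Lists.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

public class PartitionSketch {

    // Split a collection of urls into chunks of at most partitionSize entries,
    // mirroring the loop in CrawlerListener.createPartition.
    static List<List<String>> partition(final Collection<String> urls, final int partitionSize) {
        final List<List<String>> partitions = new ArrayList<>();
        for (final String url : urls) {
            int c = partitions.size();
            if (c == 0 || partitions.get(c - 1).size() >= partitionSize) {
                partitions.add(new ArrayList<>());
                c++;
            }
            partitions.get(c - 1).add(url);
        }
        return partitions;
    }

    // Zero-pad an integer to a fixed width, as CrawlerListener.intf does for the
    // depth ("d00") and partition ("p0000") components of asset file names.
    static String zeroPad(final int i, final int len) {
        String s = Integer.toString(i);
        while (s.length() < len) s = '0' + s;
        return s;
    }

    public static void main(final String[] args) {
        final List<String> urls = Arrays.asList(
                "https://example.org/a", "https://example.org/b", "https://example.org/c");
        // partitionSize 2 -> [[https://example.org/a, https://example.org/b], [https://example.org/c]]
        System.out.println(partition(urls, 2));
        // depth 1, partition 3 -> "d01-p0003" (the real asset name also carries a timestamp between the two)
        System.out.println("d" + zeroPad(1, 2) + "-p" + zeroPad(3, 4));
    }
}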