├── .github
    └── FUNDING.yml
├── bin
    ├── restart.sh
    ├── start.sh
    ├── stop.sh
    ├── crawlstart.py
    └── start_crawler_docker.sh
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── .dockerignore
├── .gitmodules
├── .gitignore
├── .settings
    ├── org.eclipse.jdt.core.prefs
    └── org.eclipse.buildship.core.prefs
├── conf
    ├── indexer_blacklist_filetypes.txt
    ├── crawler_blacklist_localhost.txt
    └── config.properties
├── src
    └── main
    │   ├── resources
    │       └── log4j.properties
    │   └── java
    │       └── net
    │           └── yacy
    │               └── grid
    │                   └── crawler
    │                       ├── api
    │                           ├── CrawlerDefaultValuesService.java
    │                           └── CrawlStartService.java
    │                       ├── Blacklist.java
    │                       ├── Crawler.java
    │                       └── CrawlerListener.java
├── .project
├── Dockerfile
├── .classpath
├── gradlew.bat
├── README.md
└── gradlew


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: orbiter
2 | patreon: 0rb1t3r
3 | 


--------------------------------------------------------------------------------
/bin/restart.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | cd "`dirname $0`"
3 | ./stop.sh
4 | sleep 1
5 | ./start.sh
6 | 
7 | 


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yacy/yacy_grid_crawler/HEAD/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | .git
 2 | .gitignore
 3 | data
 4 | build
 5 | bin
 6 | docker
 7 | Dockerfile
 8 | LICENSE.md
 9 | README.md
10 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "submodules/yacy_grid_mcp"]
2 | 	path = submodules/yacy_grid_mcp
3 | 	url = https://github.com/yacy/yacy_grid_mcp.git
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | classes/
 2 | target/
 3 | data/
 4 | /class/
 5 | /.gradle/
 6 | /build/
 7 | .DS_Store
 8 | .settings
 9 | .idea/
10 | bin/ai/
11 | bin/log4j.properties
12 | bin/net/
13 | bin/org/


--------------------------------------------------------------------------------
/bin/start.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | cd "`dirname $0`"
3 | cd ..
4 | nohup java -jar build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar < /dev/null &
5 | sleep 1
6 | echo "YaCy Grid Crawler started!"
7 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
3 | org.eclipse.jdt.core.compiler.compliance=1.8
4 | org.eclipse.jdt.core.compiler.source=1.8
5 | 


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | 


--------------------------------------------------------------------------------
/conf/indexer_blacklist_filetypes.txt:
--------------------------------------------------------------------------------
1 | # Indexing Blacklist for bad file types
2 | 
3 | .*?\.xml         # Reject XML in search index
4 | .*?\.css         # Reject CSS in search index
5 | .*?\.js          # Reject JavaScript in search index
6 | .*?/robots\.txt  # Reject robots.txt


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set root logger level to DEBUG and its only appender to A1.
 2 | log4j.rootLogger=A1
 3 | 
 4 | # A1 is set to be a ConsoleAppender.
 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
 6 | 
 7 | # A1 uses PatternLayout.
 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
 9 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
10 | 
11 | log4j.logger.org.eclipse.jetty = INFO
12 | log4j.logger.org.apache.http = INFO
13 | 


--------------------------------------------------------------------------------
/conf/crawler_blacklist_localhost.txt:
--------------------------------------------------------------------------------
 1 | # Blacklist for local, private or intranet URLs
 2 | 
 3 | .*?//localhost.*+                      # Localhost host name
 4 | .*?//127\..*+                          # Localhost IPv4
 5 | .*?//10\..*+                           # Private IPv4 Class A Network 10.x.x.x
 6 | .*?//172\.(1[6-9]|2[0-9]|3[0-1])\..*+  # Private IPv4 Class B Network 172.16.0.0 .. 172.31.255.255
 7 | .*?//192\.168\..*+                     # Private IPv4 Class C Network 192.168.0.0 .. 192.168.255.255
 8 | .*?//^::1.*+                           # Localhost IPv6
 9 | .*?//[fF][cCdD].*+                     # IPv6 User Local Address Space
10 | 


--------------------------------------------------------------------------------
/bin/stop.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | cd "`dirname $0`"
 3 | cd ../data
 4 | KILLFILE="crawler-8300.kill"
 5 | PIDFILE="crawler-8300.pid"
 6 | 
 7 | # first method to terminate the process
 8 | if [ -f "$KILLFILE" ];
 9 | then
10 |    rm $KILLFILE
11 |    echo "termination requested, waiting.."
12 |    # this can take 10 seconds..
13 |    sleep 10
14 | fi
15 | 
16 | # second method to terminate the process
17 | if [ -f "$PIDFILE" ];
18 | then
19 |    fuser -k $PIDFILE
20 | fi
21 | 
22 | # check if file does not exist any more which would be a sign that this has terminated
23 | if [ ! -f "$PIDFILE" ];
24 | then
25 |    echo "process terminated"
26 | fi
27 | 
28 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.buildship.core.prefs:
--------------------------------------------------------------------------------
 1 | arguments=
 2 | auto.sync=false
 3 | build.commands=org.eclipse.jdt.core.javabuilder
 4 | build.scans.enabled=false
 5 | connection.arguments=
 6 | connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(5.6.2))
 7 | connection.java.home=null
 8 | connection.jvm.arguments=
 9 | connection.project.dir=
10 | derived.resources=.gradle,build
11 | eclipse.preferences.version=1
12 | gradle.user.home=
13 | java.home=
14 | jvm.arguments=
15 | natures=org.eclipse.jdt.core.javanature
16 | offline.mode=false
17 | override.workspace.settings=true
18 | project.path=\:
19 | show.console.view=true
20 | show.executions.view=true
21 | 


--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>yacy_grid_crawler</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.buildship.core.gradleprojectbuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>org.eclipse.buildship.core.gradleprojectnature</nature>
21 | 		<nature>org.eclipse.jdt.core.javanature</nature>
22 | 	</natures>
23 | </projectDescription>
24 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | ## yacy_grid_crawler dockerfile
 2 | ## examples:
 3 | # docker build -t yacy_grid_crawler .
 4 | # docker run -d --rm -p 8300:8300 --name yacy_grid_crawler yacy_grid_crawler
 5 | ## Check if the service is running:
 6 | # curl http://localhost:8300/yacy/grid/mcp/info/status.json
 7 | 
 8 | # build app
 9 | FROM eclipse-temurin:8-jdk-focal AS appbuilder
10 | COPY ./ /app
11 | WORKDIR /app
12 | RUN ./gradlew clean shadowDistTar
13 | 
14 | # build dist
15 | FROM eclipse-temurin:8-jre-focal
16 | LABEL maintainer="Michael Peter Christen <mc@yacy.net>"
17 | ENV DEBIAN_FRONTEND noninteractive
18 | ARG default_branch=master
19 | COPY ./conf /app/conf/
20 | COPY --from=appbuilder /app/build/libs/ ./app/build/libs/
21 | WORKDIR /app
22 | EXPOSE 8300
23 | 
24 | # for some weird reason the jar file is sometimes not named correctly
25 | RUN if [ -e /app/build/libs/app-0.0.1-SNAPSHOT-all.jar ] ; then mv /app/build/libs/app-0.0.1-SNAPSHOT-all.jar /app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar; fi
26 | 
27 | CMD ["java", "-Xms320M", "-Xmx2G", "-jar", "/app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar"]
28 | 


--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="bin/main" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="gradle_scope" value="main"/>
 6 | 			<attribute name="gradle_used_by_scope" value="main,test"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry kind="src" output="bin/main" path="submodules/yacy_grid_mcp/src/main/java">
10 | 		<attributes>
11 | 			<attribute name="gradle_scope" value="main"/>
12 | 			<attribute name="gradle_used_by_scope" value="main,test"/>
13 | 		</attributes>
14 | 	</classpathentry>
15 | 	<classpathentry kind="src" output="bin/main" path="src/main/resources">
16 | 		<attributes>
17 | 			<attribute name="gradle_scope" value="main"/>
18 | 			<attribute name="gradle_used_by_scope" value="main,test"/>
19 | 		</attributes>
20 | 	</classpathentry>
21 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
22 | 	<classpathentry kind="con" path="org.eclipse.buildship.core.gradleclasspathcontainer"/>
23 | 	<classpathentry kind="output" path="bin/default"/>
24 | </classpath>
25 | 


--------------------------------------------------------------------------------
/bin/crawlstart.py:
--------------------------------------------------------------------------------
 1 | #!/usr/local/bin/python3
 2 | 
 3 | import sys
 4 | import requests
 5 | import urllib.parse
 6 | 
 7 | crawlingURL = sys.argv[1]
 8 | parsed_url = urllib.parse.urlparse(crawlingURL)
 9 | crawlingHost = parsed_url.netloc
10 | crawlingProtocol = parsed_url.scheme
11 | 
12 | data = {
13 |     'cachePolicy': 'iffresh',
14 |     'collection': 'testcollection',
15 |     'crawlingstart': 'Start crawling',
16 |     'crawlingMode': 'url',
17 |     'crawlingQ': 'on',
18 |     'crawlingDepth': 1,
19 |     'crawlingDepthExtension': '',
20 |     'crawlingURL': crawlingURL,
21 |     'deleteIfOlderNumber': 1,
22 |     'deleteIfOlderUnit': 'day',
23 |     'deleteold': 'age',
24 |     'indexmustmatch': '^{0}.*'.format(crawlingURL),
25 |     'indexmustnotmatch':  '',
26 |     'indexMedia': 'on',
27 |     'mustmatch': '^{protocol}://{host}/.*'.format(protocol=crawlingProtocol, host=crawlingHost),
28 |     'mustnotmatch': '',
29 |     'indexText': 'on',
30 |     'range': 'wide',
31 |     'recrawl': 'reload',
32 |     'reloadIfOlderNumber': 0,
33 |     'reloadIfOlderUnit': 'day',
34 |     'storeHTCache': 'on',
35 |     'xsstopw': 'on',
36 |     'priority': 0
37 | }
38 | 
39 | res = requests.get('http://localhost:8300/yacy/grid/crawler/crawlStart.json', params=data)
40 | 
41 | if res.status_code != 200:
42 |     print("ERR :: error starting the crawler")
43 |     print(res.text)
44 | else:
45 |     print("INF :: successfully sent '{0}' to crawler".format(crawlingURL))
46 | 


--------------------------------------------------------------------------------
/bin/start_crawler_docker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "`dirname $0`"
 3 | 
 4 | bindhost="127.0.0.1"
 5 | callhost="localhost"
 6 | appname="YaCy Grid Crawler"
 7 | containername=yacy-grid-crawler
 8 | imagename=${containername//-/_}
 9 | dockerfile="Dockerfile"
10 | production=false
11 | open=false
12 | 
13 | usage() { echo "usage: $0 [-o | --open | -p | --production | --arm32 | --arm64 ]" 1>&2; exit 1; }
14 | 
15 | while [[ $# -gt 0 ]]; do
16 |   case "$1" in
17 |     -p | --production ) production=true; shift 1;;
18 |     -o | --open ) open=true; shift 1;;
19 |     --arm32 ) imagename=${imagename}:arm32; dockerfile=${dockerfile}_arm32; shift 1;;
20 |     --arm64 ) imagename=${imagename}:arm64; dockerfile=${dockerfile}_arm64; shift 1;;
21 |     -h | --help | -* | --* | * ) usage;;
22 |   esac
23 | done
24 | if [ "$production" = true ] ; then imagename="yacy/${imagename}"; fi
25 | if [ "$open" = true ] ; then bindhost="0.0.0.0"; callhost=`hostname`; fi
26 | 
27 | containerRuns=$(docker ps | grep -i "${containername}" | wc -l ) 
28 | containerExists=$(docker ps -a | grep -i "${containername}" | wc -l ) 
29 | if [ ${containerRuns} -gt 0 ]; then
30 |   echo "${appname} container is already running"
31 | elif [ ${containerExists} -gt 0 ]; then
32 |   docker start ${containername}
33 |   echo "${appname} container re-started"
34 | else
35 |   if [[ $imagename != "yacy/"*":latest" ]] && [[ "$(docker images -q ${imagename} 2> /dev/null)" == "" ]]; then
36 |       cd ..
37 |       docker build -t ${imagename} -f ${dockerfile} .
38 |       cd bin
39 |   fi
40 |   docker run -d --restart=unless-stopped -p ${bindhost}:8300:8300 \
41 | 	 --link yacy-grid-minio --link yacy-grid-rabbitmq --link yacy-grid-elasticsearch --link yacy-grid-mcp \
42 | 	 -e YACYGRID_GRID_MCP_ADDRESS=yacy-grid-mcp \
43 | 	 --name ${containername} ${imagename}
44 |   echo "${appname} started."
45 | fi
46 | docker ps -a --format "table {{.ID}}\t{{.Image}}\t{{.Names}}\t{{.Mounts}}\t{{.Ports}}"
47 | 
48 | echo "To get the app status, open http://${callhost}:8300/yacy/grid/mcp/info/status.json"
49 | 


--------------------------------------------------------------------------------
/conf/config.properties:
--------------------------------------------------------------------------------
 1 | port = 8300
 2 | grid.mcp.address = 127.0.0.1:8100,node00.local:8100,brain.local:8100,searchlab.eu:8100
 3 | grid.broker.lazy = true
 4 | grid.broker.queue.limit = 0
 5 | grid.broker.queue.throttling = 100000
 6 | grid.assets.delete = true
 7 | 
 8 | # The blacklist is choosen with the attribute grid.crawler.blacklist which gives the file name(s) of the blacklist(s) to be used.
 9 | # To use your own blacklist, create a file in data/crawler-8300/conf/ and set the name of it
10 | # in the attribute grid.crawler.blacklist.
11 | #
12 | # You can use several blacklists simultanously, just comma-separate the names of the file names.
13 | # all files in the path conf/ and data/crawler-8300/conf/ are found.
14 | # The same applies to files in parallel processes like data/crawler-8301/conf/ and so on.
15 | #
16 | # The file format of the blacklist is:
17 | # - it is a plain text file in UTF-8 encoding
18 | # - every line beginning with '#' is a comment and is ignored
19 | # - every string, matching with ' #.*' is removed. This cuts away comments from the end of a line.
20 | # - every blank line is ignored
21 | # - every other line must contain a regular expression according to
22 | #   https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
23 | #   which is considered as a matcher pattern (not a find pattern) for an URL.
24 | #   Lines containing a regular expression get their leading and trailing spaces removed.
25 | # 
26 | # All regular expressions are considered to be a disjunction (OR logic) for the filtering of cralwing urls.
27 | # URLs are normalized before a matching is attempted, that means they are encoded propery
28 | # and the fragment identifier is removed from the end of the URL.
29 | grid.crawler.blacklist = crawler_blacklist_someonewhocares.txt,crawler_blacklist_localhost.txt
30 | grid.indexer.blacklist = indexer_blacklist_filetypes.txt
31 | grid.indexer.priorityQueues = 2
32 | 
33 | 
34 | 
35 | ####################################################################
36 | ## The following properties must be identical to those in the MCP ##
37 | ####################################################################
38 | 
39 | # The grid name is used to separate different grid networks.
40 | # Only networks with the same name connect with each other
41 | grid.name = freeworld
42 | 
43 | # Index names of the grid indexes:
44 | # crawlstart : a history of all crawl starts 
45 | # crawler    : tracking of crawling progress
46 | # query      : a history of all queries
47 | # web        : the document search index ("web index", there)
48 | grid.elasticsearch.indexName.crawlstart = crawlstart
49 | grid.elasticsearch.indexName.crawler = crawler
50 | grid.elasticsearch.indexName.query = query
51 | grid.elasticsearch.indexName.web = web
52 | 
53 | # the following type name is an intermediate solution to migrate from elastic 6.x to 8.x
54 | # unfortunately the current index type name is 'web' but in future elastic versions the name '_doc'
55 | # is mandatory. We will use this setting until migration to elastic 8.x is complete and delete
56 | # the configuration afterwards.
57 | grid.elasticsearch.typeName = web


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @rem
 2 | @rem Copyright 2015 the original author or authors.
 3 | @rem
 4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
 5 | @rem you may not use this file except in compliance with the License.
 6 | @rem You may obtain a copy of the License at
 7 | @rem
 8 | @rem      https://www.apache.org/licenses/LICENSE-2.0
 9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 | 
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem  Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 | 
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 | 
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 | 
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 | 
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 | 
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 | 
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 | 
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 | 
51 | goto fail
52 | 
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 | 
57 | if exist "%JAVA_EXE%" goto execute
58 | 
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 | 
65 | goto fail
66 | 
67 | :execute
68 | @rem Setup the command line
69 | 
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 | 
72 | 
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 | 
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 | 
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 | 
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 | 
89 | :omega
90 | 


--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/api/CrawlerDefaultValuesService.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *  CrawlerDefaultValuesService
 3 |  *  Copyright 04.6.2017 by Michael Peter Christen, @0rb1t3r
 4 |  *
 5 |  *  This library is free software; you can redistribute it and/or
 6 |  *  modify it under the terms of the GNU Lesser General Public
 7 |  *  License as published by the Free Software Foundation; either
 8 |  *  version 2.1 of the License, or (at your option) any later version.
 9 |  *
10 |  *  This library is distributed in the hope that it will be useful,
11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 |  *  Lesser General Public License for more details.
14 |  *
15 |  *  You should have received a copy of the GNU Lesser General Public License
16 |  *  along with this program in the file lgpl21.txt
17 |  *  If not, see <http://www.gnu.org/licenses/>.
18 |  */
19 | 
20 | package net.yacy.grid.crawler.api;
21 | 
22 | import javax.servlet.http.HttpServletResponse;
23 | 
24 | import org.json.JSONObject;
25 | 
26 | import net.yacy.grid.contracts.User;
27 | import net.yacy.grid.http.APIHandler;
28 | import net.yacy.grid.http.ObjectAPIHandler;
29 | import net.yacy.grid.http.Query;
30 | import net.yacy.grid.http.ServiceResponse;
31 | 
32 | /**
33 |  *
34 |  * Test URL:
35 |  * http://localhost:8300/yacy/grid/crawler/defaultValues.json
36 |  *
37 |  * Test command:
38 |  * curl http://localhost:8300/yacy/grid/crawler/defaultValues.json
39 |  */
40 | public class CrawlerDefaultValuesService extends ObjectAPIHandler implements APIHandler {
41 | 
42 |     private static final long serialVersionUID = 8578474303031749879L;
43 |     public static final String NAME = "defaultValues";
44 | 
45 |     public static JSONObject defaultValues = new JSONObject(true);
46 |     static {
47 |         defaultValues.put("crawlingMode", "url");
48 |         defaultValues.put("crawlingURL", "");
49 |         defaultValues.put("sitemapURL", "");
50 |         defaultValues.put("crawlingFile", "");
51 |         defaultValues.put("crawlingDepth", 3);
52 |         defaultValues.put("crawlingDepthExtension", "");
53 |         defaultValues.put("range", "domain");
54 |         defaultValues.put("mustmatch", ".*");
55 |         defaultValues.put("mustnotmatch", ".*\\.(js|css|jpg|jpeg|png|dmg|mpg|mpeg|zip|gz|exe|pkg)");
56 |         defaultValues.put("ipMustmatch", ".*");
57 |         defaultValues.put("ipMustnotmatch", "");
58 |         defaultValues.put("indexmustmatch", ".*");
59 |         defaultValues.put("indexmustnotmatch", "");
60 |         defaultValues.put("deleteold", "off");
61 |         defaultValues.put("deleteIfOlderNumber", 0);
62 |         defaultValues.put("deleteIfOlderUnit", "day");
63 |         defaultValues.put("recrawl", "nodoubles");
64 |         defaultValues.put("reloadIfOlderNumber", 0);
65 |         defaultValues.put("reloadIfOlderUnit", "day");
66 |         defaultValues.put("crawlingDomMaxCheck", "off");
67 |         defaultValues.put("crawlingDomMaxPages", 1000);
68 |         defaultValues.put("crawlingQ", "off");
69 |         defaultValues.put("cachePolicy", "if fresh");
70 |         defaultValues.put("collection", "user"); // corpus name
71 |         defaultValues.put("agentName", "");
72 |         defaultValues.put("priority", 0);
73 |         defaultValues.put("loaderHeadless", "false");
74 |         defaultValues.put("user_id", User.ANONYMOUS_ID);
75 |         defaultValues.put("storeAssets", "false");
76 |         defaultValues.put("archiveWARC", "false");
77 |         defaultValues.put("archiveIndex", "false");
78 |         defaultValues.put("archiveGraph", "false");
79 |     }
80 | 
81 |     @Override
82 |     public String getAPIPath() {
83 |         return "/yacy/grid/crawler/" + NAME + ".json";
84 |     }
85 | 
86 |     public static JSONObject crawlStartDefaultClone() {
87 |     	final JSONObject json = new JSONObject(true);
88 |     	defaultValues.keySet().forEach(key -> json.put(key, defaultValues.get(key)));
89 |         return json;
90 |     }
91 | 
92 |     @Override
93 |     public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) {
94 |         return new ServiceResponse(defaultValues);
95 |     }
96 | 
97 | }
98 | 
99 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # YaCy Grid Component: Crawler
 2 | 
 3 | The YaCy Grid is the second-generation implementation of YaCy, a peer-to-peer search engine.
 4 | A YaCy Grid installation consists of a set of micro-services which communicate with each other
 5 | using the MCP, see https://github.com/yacy/yacy_grid_mcp
 6 | 
 7 | ## Purpose
 8 | 
 9 | The Crawler is a microservices which can be deployed i.e. using Docker. When the Crawler Component
10 | is started, it searches for a MCP and connect to it. By default the local host is searched for a
11 | MCP but you can configure one yourself.
12 | 
13 | ## What it does
14 | 
15 | The Crawler then does the following:
16 | 
17 | ```
18 | while (a Crawl Contract is in the queue crawler_pending) do
19 |    - read the target url from the contract
20 |    - check against the search index if the url is registered in the transaction index as 'to-be-parsed'. If not, continue
21 |    - load the url content from the assets (it must have been loaded before! - that is another process)
22 |    - parse the content and create a YaCy JSON object with that content
23 |    - place the YaCy JSON within a contract in the index_pending queue
24 |    - extract all links from the YaCy JSON
25 |    - check the validity of the links using the crawl contract
26 |    - all remaining urls are checked against the transaction index, all existing urls are discarded
27 |    - write an index entry for the remaining urls with status 'to-be-loaded'
28 |    - and these remaining urls are placed onto the loader_pending queue
29 |    - the status of the target url is set to to-be-indexed
30 | od
31 | ```
32 | ## Required Infrastructure (Search Index, Asset Storage and Message Queues)
33 | 
34 | This requires an transaction index with the following information:
35 | * `URL` (as defined with https://tools.ietf.org/html/rfc3986)
36 | * `crawlid` (a hash)
37 | * status (`to-be-loaded`, `to-be-parsed`, `to-be-indexed`, `indexed`)
38 | As long as a crawl process is running, new urls (as discovered in the html source of a target url)
39 | must be written to the transaction index before the target url has a status change (from to-be-parsed to to-be-indexed).
40 | This makes it possible that the status of a crawl job and the fact that it has been terminted can be
41 | discovered from the transaction index.
42 | * if all status entries for a single `crawlid` are `indexed` then the crawl has been terminated.
43 | The Crawl process needs another database index, which contains the crawl description. The content must be almost the same as
44 | describe in http://www.yacy-websuche.de/wiki/index.php/Dev:APICrawler
45 | 
46 | Every loader and parser microservice must read this crawl profile information. Because that information is required
47 | many times, we omit a request into the cawler index by adding the crawler profile into each contract of a crawl job in the
48 | crawler_pending and loader_pending queue.
49 | 
50 | The crawl is therefore controlled by those queues:
51 | * `loader_pending` queue: entries which the yacy_grid_loader process reads. This process loads given resources and writes them to the asset storage.
52 | * `crawler_pending`queue: entries which the yacy_grid_crawler process reads. This process loads the content from the asset storage, parses the content and creates new loader_pending tasks.
53 | 
54 | The required indexes are:
55 | * a crawl profile index
56 | * a transaction index which reflects the crawl status
57 | * a search index
58 | 
59 | The microservices will create these indexes on their own using the MCP component.
60 | 
61 | ## Installation: Download, Build, Run
62 | At this time, yacy_grid_crawler is not provided in compiled form, you easily build it yourself. It's not difficult and done in one minute! The source code is hosted at https://github.com/yacy/yacy_grid_crawler, you can download it and run loklak with:
63 | 
64 |     > git clone --recursive https://github.com/yacy/yacy_grid_crawler.git
65 | 
66 | If you just want to make a update, do the following
67 | 
68 |     > git pull origin master
69 |     > git submodule foreach git pull origin master
70 | 
71 | To build and start the crawler, run
72 | 
73 |     > cd yacy_grid_crawler
74 |     > gradle run
75 | 
76 | Please read also https://github.com/yacy/yacy_grid_mcp/edit/master/README.md for further details.
77 | 
78 | ## Contribute
79 | 
80 | This is a community project and your contribution is welcome!
81 | 
82 | 1. Check for [open issues](https://github.com/yacy/yacy_grid_crawler/issues)
83 |    or open a fresh one to start a discussion around a feature idea or a bug.
84 | 2. Fork [the repository](https://github.com/yacy/yacy_grid_crawler.git)
85 |    on GitHub to start making your changes (branch off of the master branch).
86 | 3. Write a test that shows the bug was fixed or the feature works as expected.
87 | 4. Send a pull request and bug us on Gitter until it gets merged and published. :)
88 | 
89 | ## What is the software license?
90 | LGPL 2.1
91 | 
92 | Have fun!
93 | 
94 | @0rb1t3r
95 | 


--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/Blacklist.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *  Blacklist
  3 |  *  Copyright 17.02.2018 by Michael Peter Christen, @0rb1t3r
  4 |  *
  5 |  *  This library is free software; you can redistribute it and/or
  6 |  *  modify it under the terms of the GNU Lesser General Public
  7 |  *  License as published by the Free Software Foundation; either
  8 |  *  version 2.1 of the License, or (at your option) any later version.
  9 |  *
 10 |  *  This library is distributed in the hope that it will be useful,
 11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 |  *  Lesser General Public License for more details.
 14 |  *
 15 |  *  You should have received a copy of the GNU Lesser General Public License
 16 |  *  along with this program in the file lgpl21.txt
 17 |  *  If not, see <http://www.gnu.org/licenses/>.
 18 |  */
 19 | 
 20 | 
 21 | package net.yacy.grid.crawler;
 22 | 
 23 | import java.io.File;
 24 | import java.io.IOException;
 25 | import java.nio.charset.StandardCharsets;
 26 | import java.nio.file.Files;
 27 | import java.util.ArrayList;
 28 | import java.util.List;
 29 | import java.util.concurrent.atomic.AtomicInteger;
 30 | import java.util.regex.Matcher;
 31 | import java.util.regex.Pattern;
 32 | import java.util.regex.PatternSyntaxException;
 33 | 
 34 | import net.yacy.grid.tools.ARC;
 35 | import net.yacy.grid.tools.HashARC;
 36 | import net.yacy.grid.tools.Logger;
 37 | import net.yacy.grid.tools.MultiProtocolURL;
 38 | 
 39 | /**
 40 |  * A blacklist class to test if an URL is blacklisted.
 41 |  * This class has no object synchronization and it must not be used in concurrent environment.
 42 |  * The lack of concurrency is done on purpose. Each concurrent thread must initialize it's own blacklist.
 43 |  * This ensures that no concurrency issue appears between threads using the same blacklist.
 44 |  */
 45 | public class Blacklist {
 46 | 
 47 |     private final ARC<String, BlacklistInfo> blacklistHitCache;
 48 |     private final ARC<String, Boolean> blacklistMissCache;
 49 |     private final List<BlacklistInfo> blacklist;
 50 | 
 51 |     public Blacklist() {
 52 |         this.blacklist = new ArrayList<>();
 53 |         this.blacklistHitCache = new HashARC<>(100000);
 54 |         this.blacklistMissCache = new HashARC<>(100000);
 55 |     }
 56 | 
 57 |     public void load(File f) throws IOException {
 58 |         final AtomicInteger counter = new AtomicInteger(0);
 59 |         Files.lines(f.toPath(), StandardCharsets.UTF_8).forEach(line -> {
 60 |             line = line.trim();
 61 |             int p = line.indexOf(" #");
 62 |             String info = "";
 63 |             if (p >= 0) {
 64 |                 info = line.substring(p + 1).trim();
 65 |                 line = line.substring(0, p);
 66 |             }
 67 |             line = line.trim();
 68 |             if (!line.isEmpty() && !line.startsWith("#")) {
 69 |                 if (line.startsWith("host ")) {
 70 |                     String host = line.substring(5).trim();
 71 |                     try {
 72 |                         BlacklistInfo bi = new BlacklistInfo(".*?//" + host + "/.*+", f.getName(), info, host);
 73 |                         this.blacklist.add(bi);
 74 |                         counter.incrementAndGet();
 75 |                     } catch (PatternSyntaxException e) {
 76 |                         Logger.warn(this.getClass(), "regex for host in file " + f.getName() + " cannot be compiled: " + line.substring(5).trim());
 77 |                     }
 78 |                 } else {
 79 |                     try {
 80 |                         BlacklistInfo bi = new BlacklistInfo(line, f.getName(), info, null);
 81 |                         this.blacklist.add(bi);
 82 |                         counter.incrementAndGet();
 83 |                     } catch (PatternSyntaxException e) {
 84 |                         Logger.warn(this.getClass(), "regex for url in file " + f.getName() + " cannot be compiled: " + line);
 85 |                     }
 86 |                 }
 87 |             }
 88 |         });
 89 |         Logger.info(this.getClass(), "loaded " + counter.get() + " blacklist entries from file " + f.getName());
 90 |     }
 91 | 
 92 |     public final static class BlacklistInfo {
 93 |         public final Matcher matcher;
 94 |         public final String source;
 95 |         public final String info;
 96 |         public final String host;
 97 |         public BlacklistInfo(final String patternString, final String source, final String info, final String host) throws PatternSyntaxException {
 98 |             this.matcher = Pattern.compile(patternString).matcher("");
 99 |             this.source = source;
100 |             this.info = info;
101 |             this.host = host;
102 |         }
103 |     }
104 | 
105 |     public BlacklistInfo isBlacklisted(String url, MultiProtocolURL u) {
106 |         BlacklistInfo cachedBI = this.blacklistHitCache.get(url);
107 |         if (cachedBI != null) return cachedBI;
108 |         Boolean cachedMiss = this.blacklistMissCache.get(url);
109 |         if (cachedMiss != null) return null;
110 |         for (BlacklistInfo bi: this.blacklist) {
111 |             if (u != null && bi.host != null) {
112 |                 if (u.getHost().equals(bi.host)) {
113 |                     return bi;
114 |                 }
115 |             } else {
116 |                 bi.matcher.reset(url);
117 |                 //Thread.currentThread().setName(bi.matcher.pattern().pattern() + " -> " + url);
118 |                 if (bi.matcher.matches()) {
119 |                     this.blacklistHitCache.put(url, bi);
120 |                     return bi;
121 |                 }
122 |             }
123 |         }
124 |         this.blacklistMissCache.put(url, Boolean.TRUE);
125 |         return null;
126 |     }
127 | 
128 | }
129 | 


--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/Crawler.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *  Crawler
  3 |  *  Copyright 25.04.2017 by Michael Peter Christen, @0rb1t3r
  4 |  *
  5 |  *  This library is free software; you can redistribute it and/or
  6 |  *  modify it under the terms of the GNU Lesser General Public
  7 |  *  License as published by the Free Software Foundation; either
  8 |  *  version 2.1 of the License, or (at your option) any later version.
  9 |  *
 10 |  *  This library is distributed in the hope that it will be useful,
 11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 |  *  Lesser General Public License for more details.
 14 |  *
 15 |  *  You should have received a copy of the GNU Lesser General Public License
 16 |  *  along with this program in the file lgpl21.txt
 17 |  *  If not, see <http://www.gnu.org/licenses/>.
 18 |  */
 19 | 
 20 | package net.yacy.grid.crawler;
 21 | 
 22 | import java.util.ArrayList;
 23 | import java.util.Arrays;
 24 | import java.util.List;
 25 | 
 26 | import javax.servlet.Servlet;
 27 | 
 28 | import net.yacy.grid.YaCyServices;
 29 | import net.yacy.grid.crawler.api.CrawlStartService;
 30 | import net.yacy.grid.crawler.api.CrawlerDefaultValuesService;
 31 | import net.yacy.grid.mcp.BrokerListener;
 32 | import net.yacy.grid.mcp.Configuration;
 33 | import net.yacy.grid.mcp.MCP;
 34 | import net.yacy.grid.mcp.Service;
 35 | import net.yacy.grid.tools.CronBox;
 36 | import net.yacy.grid.tools.CronBox.Telemetry;
 37 | import net.yacy.grid.tools.Logger;
 38 | 
 39 | /**
 40 |  * The Crawler main class
 41 |  *
 42 |  * performance debugging:
 43 |  * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt
 44 |  * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt?count=100
 45 |  */
 46 | public class Crawler {
 47 | 
 48 |     private final static YaCyServices CRAWLER_SERVICE = YaCyServices.crawler;
 49 |     private final static String DATA_PATH = "data";
 50 | 
 51 |     // define services
 52 |     @SuppressWarnings("unchecked")
 53 |     public final static Class<? extends Servlet>[] CRAWLER_SERVICES = new Class[]{
 54 |             CrawlerDefaultValuesService.class,
 55 |             CrawlStartService.class
 56 |     };
 57 | 
 58 |     public static class Application implements CronBox.Application {
 59 | 
 60 |         final Configuration config;
 61 |         final Service service;
 62 |         final BrokerListener brokerApplication;
 63 |         final CronBox.Application serviceApplication;
 64 | 
 65 |         public Application() {
 66 |             Logger.info("Starting Crawler Application...");
 67 | 
 68 |             // initialize configuration
 69 |             final List<Class<? extends Servlet>> services = new ArrayList<>();
 70 |             services.addAll(Arrays.asList(MCP.MCP_SERVLETS));
 71 |             services.addAll(Arrays.asList(CRAWLER_SERVICES));
 72 |             this.config =  new Configuration(DATA_PATH, true, CRAWLER_SERVICE, services.toArray(new Class[services.size()]));
 73 |             final int priorityQueues = Integer.parseInt(this.config.properties.get("grid.indexer.priorityQueues"));
 74 |             CrawlerListener.initPriorityQueue(priorityQueues);
 75 | 
 76 |             // initialize REST server with services
 77 |             this.service = new Service(this.config);
 78 | 
 79 |             // connect backend
 80 |             this.config.connectBackend();
 81 | 
 82 |             // initiate broker application: listening to indexing requests at RabbitMQ
 83 |             this.brokerApplication = new CrawlerListener(this.config, CRAWLER_SERVICE);
 84 | 
 85 |             // initiate service application: listening to REST request
 86 |             this.serviceApplication = this.service.newServer(null);
 87 |         }
 88 | 
 89 |         @Override
 90 |         public void run() {
 91 | 
 92 |             Logger.info("Grid Name: " + this.config.properties.get("grid.name"));
 93 | 
 94 |             // starting threads
 95 |             new Thread(this.brokerApplication).start();
 96 |             this.serviceApplication.run(); // SIC! the service application is running as the core element of this run() process. If we run it concurrently, this runnable will be "dead".
 97 |         }
 98 | 
 99 |         @Override
100 |         public void stop() {
101 |             Logger.info("Stopping Crawler Application...");
102 |             this.serviceApplication.stop();
103 |             this.brokerApplication.stop();
104 |             this.service.stop();
105 |             this.service.close();
106 |             this.config.close();
107 |         }
108 | 
109 |         @Override
110 |         public Telemetry getTelemetry() {
111 |             return null;
112 |         }
113 | 
114 |     }
115 | 
116 |     public static void main(final String[] args) {
117 |         // run in headless mode
118 |         System.setProperty("java.awt.headless", "true"); // no awt used here so we can switch off that stuff
119 | 
120 |         // Debug Info
121 |         boolean assertionenabled = false;
122 |         assert (assertionenabled = true) == true; // compare to true to remove warning: "Possible accidental assignement"
123 |         if (assertionenabled) Logger.info("Asserts are enabled");
124 | 
125 |         // first greeting
126 |         Logger.info("YaCy Grid Crawler started!");
127 | 
128 |         // run application with cron
129 |         final long cycleDelay = Long.parseLong(System.getProperty("YACYGRID_CRAWLER_CYCLEDELAY", "" + Long.MAX_VALUE)); // by default, run only in one genesis thread
130 |         final int cycleRandom = Integer.parseInt(System.getProperty("YACYGRID_CRAWLER_CYCLERANDOM", "" + 1000 * 60 /*1 minute*/));
131 |         final CronBox cron = new CronBox(Application.class, cycleDelay, cycleRandom);
132 |         cron.cycle();
133 | 
134 |         // this line is reached if the cron process was shut down
135 |         Logger.info("YaCy Grid Crawler terminated");
136 |     }
137 | 
138 | }
139 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | #
  4 | # Copyright © 2015-2021 the original authors.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #      https://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | ##############################################################################
 20 | #
 21 | #   Gradle start up script for POSIX generated by Gradle.
 22 | #
 23 | #   Important for running:
 24 | #
 25 | #   (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
 26 | #       noncompliant, but you have some other compliant shell such as ksh or
 27 | #       bash, then to run this script, type that shell name before the whole
 28 | #       command line, like:
 29 | #
 30 | #           ksh Gradle
 31 | #
 32 | #       Busybox and similar reduced shells will NOT work, because this script
 33 | #       requires all of these POSIX shell features:
 34 | #         * functions;
 35 | #         * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
 36 | #           «${var#prefix}», «${var%suffix}», and «$( cmd )»;
 37 | #         * compound commands having a testable exit status, especially «case»;
 38 | #         * various built-in commands including «command», «set», and «ulimit».
 39 | #
 40 | #   Important for patching:
 41 | #
 42 | #   (2) This script targets any POSIX shell, so it avoids extensions provided
 43 | #       by Bash, Ksh, etc; in particular arrays are avoided.
 44 | #
 45 | #       The "traditional" practice of packing multiple parameters into a
 46 | #       space-separated string is a well documented source of bugs and security
 47 | #       problems, so this is (mostly) avoided, by progressively accumulating
 48 | #       options in "$@", and eventually passing that to Java.
 49 | #
 50 | #       Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
 51 | #       and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
 52 | #       see the in-line comments for details.
 53 | #
 54 | #       There are tweaks for specific operating systems such as AIX, CygWin,
 55 | #       Darwin, MinGW, and NonStop.
 56 | #
 57 | #   (3) This script is generated from the Groovy template
 58 | #       https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
 59 | #       within the Gradle project.
 60 | #
 61 | #       You can find Gradle at https://github.com/gradle/gradle/.
 62 | #
 63 | ##############################################################################
 64 | 
 65 | # Attempt to set APP_HOME
 66 | 
 67 | # Resolve links: $0 may be a link
 68 | app_path=$0
 69 | 
 70 | # Need this for daisy-chained symlinks.
 71 | while
 72 |     APP_HOME=${app_path%"${app_path##*/}"}  # leaves a trailing /; empty if no leading path
 73 |     [ -h "$app_path" ]
 74 | do
 75 |     ls=$( ls -ld "$app_path" )
 76 |     link=${ls#*' -> '}
 77 |     case $link in             #(
 78 |       /*)   app_path=$link ;; #(
 79 |       *)    app_path=$APP_HOME$link ;;
 80 |     esac
 81 | done
 82 | 
 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
 84 | 
 85 | APP_NAME="Gradle"
 86 | APP_BASE_NAME=${0##*/}
 87 | 
 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 90 | 
 91 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 92 | MAX_FD=maximum
 93 | 
 94 | warn () {
 95 |     echo "$*"
 96 | } >&2
 97 | 
 98 | die () {
 99 |     echo
100 |     echo "$*"
101 |     echo
102 |     exit 1
103 | } >&2
104 | 
105 | # OS specific support (must be 'true' or 'false').
106 | cygwin=false
107 | msys=false
108 | darwin=false
109 | nonstop=false
110 | case "$( uname )" in                #(
111 |   CYGWIN* )         cygwin=true  ;; #(
112 |   Darwin* )         darwin=true  ;; #(
113 |   MSYS* | MINGW* )  msys=true    ;; #(
114 |   NONSTOP* )        nonstop=true ;;
115 | esac
116 | 
117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
118 | 
119 | 
120 | # Determine the Java command to use to start the JVM.
121 | if [ -n "$JAVA_HOME" ] ; then
122 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
123 |         # IBM's JDK on AIX uses strange locations for the executables
124 |         JAVACMD=$JAVA_HOME/jre/sh/java
125 |     else
126 |         JAVACMD=$JAVA_HOME/bin/java
127 |     fi
128 |     if [ ! -x "$JAVACMD" ] ; then
129 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
130 | 
131 | Please set the JAVA_HOME variable in your environment to match the
132 | location of your Java installation."
133 |     fi
134 | else
135 |     JAVACMD=java
136 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
137 | 
138 | Please set the JAVA_HOME variable in your environment to match the
139 | location of your Java installation."
140 | fi
141 | 
142 | # Increase the maximum file descriptors if we can.
143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
144 |     case $MAX_FD in #(
145 |       max*)
146 |         MAX_FD=$( ulimit -H -n ) ||
147 |             warn "Could not query maximum file descriptor limit"
148 |     esac
149 |     case $MAX_FD in  #(
150 |       '' | soft) :;; #(
151 |       *)
152 |         ulimit -n "$MAX_FD" ||
153 |             warn "Could not set maximum file descriptor limit to $MAX_FD"
154 |     esac
155 | fi
156 | 
157 | # Collect all arguments for the java command, stacking in reverse order:
158 | #   * args from the command line
159 | #   * the main class name
160 | #   * -classpath
161 | #   * -D...appname settings
162 | #   * --module-path (only if needed)
163 | #   * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
164 | 
165 | # For Cygwin or MSYS, switch paths to Windows format before running java
166 | if "$cygwin" || "$msys" ; then
167 |     APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
168 |     CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
169 | 
170 |     JAVACMD=$( cygpath --unix "$JAVACMD" )
171 | 
172 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
173 |     for arg do
174 |         if
175 |             case $arg in                                #(
176 |               -*)   false ;;                            # don't mess with options #(
177 |               /?*)  t=${arg#/} t=/${t%%/*}              # looks like a POSIX filepath
178 |                     [ -e "$t" ] ;;                      #(
179 |               *)    false ;;
180 |             esac
181 |         then
182 |             arg=$( cygpath --path --ignore --mixed "$arg" )
183 |         fi
184 |         # Roll the args list around exactly as many times as the number of
185 |         # args, so each arg winds up back in the position where it started, but
186 |         # possibly modified.
187 |         #
188 |         # NB: a `for` loop captures its iteration list before it begins, so
189 |         # changing the positional parameters here affects neither the number of
190 |         # iterations, nor the values presented in `arg`.
191 |         shift                   # remove old arg
192 |         set -- "$@" "$arg"      # push replacement arg
193 |     done
194 | fi
195 | 
196 | # Collect all arguments for the java command;
197 | #   * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
198 | #     shell script including quotes and variable substitutions, so put them in
199 | #     double quotes to make sure that they get re-expanded; and
200 | #   * put everything else in single quotes, so that it's not re-expanded.
201 | 
202 | set -- \
203 |         "-Dorg.gradle.appname=$APP_BASE_NAME" \
204 |         -classpath "$CLASSPATH" \
205 |         org.gradle.wrapper.GradleWrapperMain \
206 |         "$@"
207 | 
208 | # Use "xargs" to parse quoted args.
209 | #
210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed.
211 | #
212 | # In Bash we could simply go:
213 | #
214 | #   readarray ARGS < <( xargs -n1 <<<"$var" ) &&
215 | #   set -- "${ARGS[@]}" "$@"
216 | #
217 | # but POSIX shell has neither arrays nor command substitution, so instead we
218 | # post-process each arg (as a line of input to sed) to backslash-escape any
219 | # character that might be a shell metacharacter, then use eval to reverse
220 | # that process (while maintaining the separation between arguments), and wrap
221 | # the whole thing up as a single "set" statement.
222 | #
223 | # This will of course break if any of these variables contains a newline or
224 | # an unmatched quote.
225 | #
226 | 
227 | eval "set -- $(
228 |         printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
229 |         xargs -n1 |
230 |         sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
231 |         tr '\n' ' '
232 |     )" '"$@"'
233 | 
234 | exec "$JAVACMD" "$@"
235 | 


--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/api/CrawlStartService.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *  CrawlStartService
  3 |  *  Copyright 12.6.2017 by Michael Peter Christen, @0rb1t3r
  4 |  *
  5 |  *  This library is free software; you can redistribute it and/or
  6 |  *  modify it under the terms of the GNU Lesser General Public
  7 |  *  License as published by the Free Software Foundation; either
  8 |  *  version 2.1 of the License, or (at your option) any later version.
  9 |  *
 10 |  *  This library is distributed in the hope that it will be useful,
 11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 |  *  Lesser General Public License for more details.
 14 |  *
 15 |  *  You should have received a copy of the GNU Lesser General Public License
 16 |  *  along with this program in the file lgpl21.txt
 17 |  *  If not, see <http://www.gnu.org/licenses/>.
 18 |  */
 19 | 
 20 | package net.yacy.grid.crawler.api;
 21 | 
 22 | import java.io.IOException;
 23 | import java.nio.charset.StandardCharsets;
 24 | import java.util.Date;
 25 | import java.util.Map;
 26 | import java.util.regex.Pattern;
 27 | 
 28 | import javax.servlet.http.HttpServletResponse;
 29 | 
 30 | import org.json.JSONArray;
 31 | import org.json.JSONObject;
 32 | 
 33 | import ai.susi.mind.SusiAction;
 34 | import ai.susi.mind.SusiThought;
 35 | import net.yacy.grid.YaCyServices;
 36 | import net.yacy.grid.contracts.User;
 37 | import net.yacy.grid.crawler.CrawlerListener;
 38 | import net.yacy.grid.http.APIHandler;
 39 | import net.yacy.grid.http.ObjectAPIHandler;
 40 | import net.yacy.grid.http.Query;
 41 | import net.yacy.grid.http.ServiceResponse;
 42 | import net.yacy.grid.io.index.CrawlerMapping;
 43 | import net.yacy.grid.io.index.CrawlstartDocument;
 44 | import net.yacy.grid.io.index.CrawlstartMapping;
 45 | import net.yacy.grid.io.index.GridIndex;
 46 | import net.yacy.grid.io.index.Index.QueryLanguage;
 47 | import net.yacy.grid.io.index.WebMapping;
 48 | import net.yacy.grid.io.messages.GridQueue;
 49 | import net.yacy.grid.io.messages.ShardingMethod;
 50 | import net.yacy.grid.mcp.Service;
 51 | import net.yacy.grid.tools.Digest;
 52 | import net.yacy.grid.tools.Domains;
 53 | import net.yacy.grid.tools.JSONList;
 54 | import net.yacy.grid.tools.Logger;
 55 | import net.yacy.grid.tools.MultiProtocolURL;
 56 | 
 57 | /**
 58 |  *
 59 |  * Test URL:
 60 |  * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=yacy.net&indexmustnotmatch=.*Mitmachen.*&mustmatch=.*yacy.net.*
 61 |  * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=ix.de&crawlingDepth=6&priority=true
 62 |  * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=tagesschau.de&loaderHeadless=false
 63 |  *
 64 |  * then check crawl queue status at http://localhost:15672/
 65 |  * default account is guest:guest
 66 |  */
 67 | public class CrawlStartService extends ObjectAPIHandler implements APIHandler {
 68 | 
 69 |     private static final long serialVersionUID = 8578474303031749879L;
 70 |     public static final String NAME = "crawlStart";
 71 | 
 72 |     @Override
 73 |     public String getAPIPath() {
 74 |         return "/yacy/grid/crawler/" + NAME + ".json";
 75 |     }
 76 | 
 77 |     @Override
 78 |     public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) {
 79 |         final JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();
 80 | 
 81 |         // read call attributes using the default crawlstart key names
 82 |         for (final String key: crawlstart.keySet()) {
 83 |             final Object object = crawlstart.get(key);
 84 |             if (object instanceof String) crawlstart.put(key, call.get(key, crawlstart.getString(key)));
 85 |             else if (object instanceof Integer) crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
 86 |             else if (object instanceof Long) crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
 87 |             else if (object instanceof JSONArray) {
 88 |                 final JSONArray a = crawlstart.getJSONArray(key);
 89 |                 final Object cv = call.get(key);
 90 |                 if (cv != null) crawlstart.put(key, cv);
 91 |             } else {
 92 |                 System.out.println("unrecognized type: " + object.getClass().toString());
 93 |             }
 94 |         }
 95 |         final String user_id = crawlstart.optString("user_id", User.ANONYMOUS_ID);
 96 | 
 97 |         // fix attributes
 98 |         final int crawlingDepth = crawlstart.optInt("crawlingDepth", 3);
 99 |         crawlstart.put("crawlingDepth", Math.min(crawlingDepth, 8)); // crawlingDepth shall not exceed 8 - this is used for enhanced balancing to be able to reach crawl leaves
100 |         final String mustmatch = crawlstart.optString("mustmatch", CrawlerDefaultValuesService.defaultValues.getString("mustmatch")).trim();
101 |         crawlstart.put("mustmatch", mustmatch);
102 |         final Map<String, Pattern> collections = WebMapping.collectionParser(crawlstart.optString("collection").trim());
103 | 
104 |         // set the crawl id
105 |         final CrawlerListener.CrawlstartURLSplitter crawlstartURLs = new CrawlerListener.CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
106 |         final Date now = new Date();
107 |         // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts
108 |         final SusiThought allCrawlstarts = new SusiThought();
109 |         int count = 0;
110 |         for (final MultiProtocolURL url: crawlstartURLs.getURLs()) {
111 |             final JSONObject singlecrawl = new JSONObject();
112 |             for (final String key: crawlstart.keySet()) singlecrawl.put(key, crawlstart.get(key)); // create a clone of crawlstart
113 |             final String crawl_id = CrawlerListener.getCrawlID(url, now, count++);
114 |             final String start_url = url.toNormalform(true);
115 |             final String start_ssld = Domains.getSmartSLD(url.getHost());
116 |             singlecrawl.put("id", crawl_id);
117 |             singlecrawl.put("user_id", user_id);
118 |             singlecrawl.put("start_url", start_url);
119 |             singlecrawl.put("start_ssld", start_ssld);
120 | 
121 |             //singlecrawl.put("crawlingURLs", new JSONArray().put(url.toNormalform(true)));
122 | 
123 |             try {
124 |                 // Create a crawlstart index entry: this will keep track of all crawls that have been started.
125 |                 // once such an entry is created, it is never changed or deleted again by any YaCy Grid process.
126 |                 final CrawlstartDocument crawlstartDoc = new CrawlstartDocument()
127 |                         .setCrawlID(crawl_id)
128 |                         .setUserID(user_id)
129 |                         .setMustmatch(mustmatch)
130 |                         .setCollections(collections.keySet())
131 |                         .setCrawlstartURL(start_url)
132 |                         .setCrawlstartSSLD(start_ssld)
133 |                         .setInitDate(now)
134 |                         .setData(singlecrawl);
135 |                 crawlstartDoc.store(Service.instance.config, Service.instance.config.gridIndex);
136 | 
137 |                 // Create a crawler url tracking index entry: this will keep track of single urls and their status
138 |                 // While it is processed. The entry also serves as a double-check entry to terminate a crawl even if the
139 |                 // crawler is restarted.
140 | 
141 |                 // delete the start url
142 |                 final String url_id = Digest.encodeMD5Hex(start_url);
143 |                 final String crawlerIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER);
144 |                 final String crawlstartIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawlstart", GridIndex.DEFAULT_INDEXNAME_CRAWLSTART);
145 |                 long deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"_id\":\"" + url_id + "\"}");
146 |                 Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for _id");
147 | 
148 |                 // Because 'old' crawls may block new ones we identify possible blocking entries using the mustmatch pattern.
149 |                 // We therefore delete all entries with the same mustmatch pattern before a crawl starts.
150 |                 if (mustmatch.equals(".*")) {
151 |                     // we cannot delete all wide crawl status urls!
152 |                     final JSONList old_crawls = Service.instance.config.gridIndex.query(crawlstartIndexName, QueryLanguage.fields, "{ \"" + CrawlstartMapping.start_url_s.name() + "\":\"" + start_url + "\"}", 0, 100);
153 |                     // from there we pick out the crawl start id and delete using them
154 |                     for (final Object j: old_crawls.toArray()) {
155 |                         final String crawlid = ((JSONObject) j).optString(CrawlstartMapping.crawl_id_s.name());
156 |                         if (crawlid.length() > 0) {
157 |                             deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.crawl_id_s.name() + "\":\"" + crawlid + "\"}");
158 |                             Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for crawl_id_s");
159 |                         }
160 |                     }
161 |                     // we also delete all entries with same start_url and start_ssld
162 |                     deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_url_s.name() + "\":\"" + start_url + "\"}");
163 |                     Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_url_s");
164 |                     deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_ssld_s.name() + "\":\"" + start_ssld + "\"}");
165 |                     Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_ssld_s");
166 |                 } else {
167 |                     // this should fit exactly on the old urls
168 |                     // test url:
169 |                     // curl -s -H 'Content-Type: application/json' -X GET http://localhost:9200/crawler/_search?q=_id:0a800a8ec1cc76b5eb8412ec494babc9 | python3 -m json.tool
170 |                     final String deletequery = "{ \"" + CrawlerMapping.mustmatch_s.name() + "\":\"" + mustmatch.replace("\\", "\\\\") + "\"}";
171 |                     deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, deletequery);
172 |                     Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries");
173 |                 }
174 |                 // we do not create a crawler document entry here because that would conflict with the double check.
175 |                 // crawler documents must be written after the double check has happened.
176 | 
177 |                 // create a crawl queue entry
178 |                 final GridQueue queueName = Service.instance.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.BALANCE, CrawlerListener.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost());
179 |                 final SusiThought json = new SusiThought();
180 |                 json.setData(new JSONArray().put(singlecrawl));
181 |                 final JSONObject action = new JSONObject()
182 |                         .put("type", YaCyServices.crawler.name())
183 |                         .put("queue", queueName.name())
184 |                         .put("id", crawl_id)
185 |                         .put("user_id", user_id)
186 |                         .put("depth", 0)
187 |                         .put("sourcegraph", "rootasset");
188 |                 final SusiAction crawlAction = new SusiAction(action);
189 |                 final JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), start_url);
190 |                 crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
191 |                 json.addAction(crawlAction);
192 |                 allCrawlstarts.addAction(crawlAction);
193 |                 final byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
194 |                 Service.instance.config.gridBroker.send(YaCyServices.crawler, queueName, b);
195 | 
196 |             } catch (final IOException e) {
197 |                 Logger.warn(this.getClass(), "error when starting crawl for " + url.toNormalform(true), e);
198 |                 allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
199 |             }
200 |         }
201 | 
202 |         // construct a crawl start message
203 |         allCrawlstarts.setData(new JSONArray().put(crawlstart));
204 |         allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);
205 | 
206 |         // finally add the crawl start on the queue
207 |         return new ServiceResponse(allCrawlstarts);
208 |     }
209 | 
210 | }
211 | 
212 | 


--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/CrawlerListener.java:
--------------------------------------------------------------------------------
  1 | package net.yacy.grid.crawler;
  2 | 
  3 | import java.io.ByteArrayInputStream;
  4 | import java.io.File;
  5 | import java.io.IOException;
  6 | import java.io.Serializable;
  7 | import java.net.MalformedURLException;
  8 | import java.nio.charset.StandardCharsets;
  9 | import java.text.SimpleDateFormat;
 10 | import java.util.ArrayList;
 11 | import java.util.Collection;
 12 | import java.util.Date;
 13 | import java.util.HashMap;
 14 | import java.util.HashSet;
 15 | import java.util.Iterator;
 16 | import java.util.List;
 17 | import java.util.Locale;
 18 | import java.util.Map;
 19 | import java.util.Set;
 20 | import java.util.concurrent.ConcurrentHashMap;
 21 | import java.util.regex.Pattern;
 22 | 
 23 | import org.json.JSONArray;
 24 | import org.json.JSONObject;
 25 | 
 26 | import ai.susi.mind.SusiAction;
 27 | import ai.susi.mind.SusiThought;
 28 | import net.yacy.grid.Services;
 29 | import net.yacy.grid.YaCyServices;
 30 | import net.yacy.grid.contracts.User;
 31 | import net.yacy.grid.io.assets.Asset;
 32 | import net.yacy.grid.io.index.CrawlerDocument;
 33 | import net.yacy.grid.io.index.CrawlerDocument.Status;
 34 | import net.yacy.grid.io.index.GridIndex;
 35 | import net.yacy.grid.io.index.WebMapping;
 36 | import net.yacy.grid.io.messages.GridQueue;
 37 | import net.yacy.grid.io.messages.ShardingMethod;
 38 | import net.yacy.grid.mcp.AbstractBrokerListener;
 39 | import net.yacy.grid.mcp.BrokerListener;
 40 | import net.yacy.grid.mcp.Configuration;
 41 | import net.yacy.grid.tools.Classification.ContentDomain;
 42 | import net.yacy.grid.tools.CronBox.Telemetry;
 43 | import net.yacy.grid.tools.DateParser;
 44 | import net.yacy.grid.tools.Digest;
 45 | import net.yacy.grid.tools.JSONList;
 46 | import net.yacy.grid.tools.Logger;
 47 | import net.yacy.grid.tools.MultiProtocolURL;
 48 | 
 49 | 
 50 | public class CrawlerListener extends AbstractBrokerListener implements BrokerListener {
 51 | 
 52 |     private final static String[] FIELDS_IN_GRAPH = new String[]{
 53 |             WebMapping.inboundlinks_sxt.name(),
 54 |             WebMapping.outboundlinks_sxt.name(),
 55 |             //WebMapping.images_sxt.name(),
 56 |             WebMapping.frames_sxt.name(),
 57 |             WebMapping.iframes_sxt.name()
 58 |     };
 59 | 
 60 |     private final static String PATTERN_TIMEF = "YYYYMMddHHmmssSSS";
 61 | 
 62 |     public static int[] CRAWLER_PRIORITY_DIMENSIONS = YaCyServices.crawler.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.crawler.getSourceQueues().length - 1, 1};
 63 |     private static int[] LOADER_PRIORITY_DIMENSIONS = YaCyServices.loader.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.loader.getSourceQueues().length - 1, 1};
 64 |     private static int[] PARSER_PRIORITY_DIMENSIONS = YaCyServices.parser.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.parser.getSourceQueues().length - 1, 1};
 65 |     private static int[] INDEXER_PRIORITY_DIMENSIONS = YaCyServices.indexer.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.indexer.getSourceQueues().length - 1, 1};
 66 | 
 67 |     static void initPriorityQueue(final int priorityDimension) {
 68 |         CRAWLER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.crawler, priorityDimension);
 69 |         LOADER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.loader, priorityDimension);
 70 |         PARSER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.parser, priorityDimension);
 71 |         INDEXER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.indexer, priorityDimension);
 72 |     }
 73 | 
 74 |     private static int[] priorityDimensions(final YaCyServices service, final int d) {
 75 |         return service.getSourceQueues().length <= d ? new int[] {service.getSourceQueues().length, 0} : new int[] {service.getSourceQueues().length - d, d};
 76 |     }
 77 | 
 78 |     private final String[] blacklist_crawler_names_list, blacklist_indexer_names_list;
 79 |     private final Map<String, Blacklist> blacklists_crawler, blacklists_indexer;
 80 | 
 81 |     //private final static Map<String, DoubleCache> doubles = Service.hazelcast.getMap("doubles");
 82 |     private final Map<String, DoubleCache> doubles = new ConcurrentHashMap<>();
 83 |     private static long doublesLastCleanup = System.currentTimeMillis();
 84 |     private final static long doublesCleanupTimeout = 1000L * 60L * 60L * 24L * 7L; // cleanup after 7 days
 85 |     private final static long doublesCleanupPeriod = 1000L * 60L * 10L; // do cleanup each 10 minutes
 86 |     private static class DoubleCache implements Serializable {
 87 |         private static final long serialVersionUID = 614262945713636851L;
 88 |         public Set<String> doubleHashes;
 89 |         public long time;
 90 |         public DoubleCache() {
 91 |             this.time = System.currentTimeMillis();
 92 |             this.doubleHashes = ConcurrentHashMap.newKeySet();
 93 |         }
 94 |     }
 95 | 
 96 |     private void doDoubleCleanup() {
 97 |         final long now = System.currentTimeMillis();
 98 |         if (now - doublesLastCleanup < doublesCleanupPeriod) return;
 99 |         doublesLastCleanup = now;
100 |         final Iterator<Map.Entry<String, DoubleCache>> i = this.doubles.entrySet().iterator();
101 |         while (i.hasNext()) {
102 |             final Map.Entry<String, DoubleCache> cache = i.next();
103 |             if ((now - cache.getValue().time) > doublesCleanupTimeout) {
104 |                 cache.getValue().doubleHashes.clear();
105 |                 i.remove();
106 |             }
107 |         }
108 |     }
109 | 
110 |     public static class CrawlstartURLSplitter {
111 | 
112 |         private final List<MultiProtocolURL> crawlingURLArray;
113 |         private final List<String> badURLStrings;
114 | 
115 |         public CrawlstartURLSplitter(String crawlingURLsString) {
116 |             Logger.info(this.getClass(), "splitting url list: " + crawlingURLsString);
117 |             crawlingURLsString = crawlingURLsString.replaceAll("\\|http", "\nhttp").replaceAll("%7Chttp", "\nhttp").replaceAll("%0D%0A", "\n").replaceAll("%0A", "\n").replaceAll("%0D", "\n").replaceAll(" ", "\n");
118 |             final String[] crawlingURLs = crawlingURLsString.split("\n");
119 |             this.crawlingURLArray = new ArrayList<>();
120 |             this.badURLStrings = new ArrayList<>();
121 |             for (final String u: crawlingURLs) {
122 |                 try {
123 |                     final MultiProtocolURL url = new MultiProtocolURL(u);
124 |                     Logger.info(this.getClass(), "splitted url: " + url.toNormalform(true));
125 |                     this.crawlingURLArray.add(url);
126 |                 } catch (final MalformedURLException e) {
127 |                     this.badURLStrings.add(u);
128 |                     Logger.warn(this.getClass(), "error when starting crawl with splitter url " + u + "; splitted from " + crawlingURLsString, e);
129 |                 }
130 |             }
131 |         }
132 | 
133 |         public List<MultiProtocolURL> getURLs() {
134 |             return this.crawlingURLArray;
135 |         }
136 | 
137 |         public List<String> getBadURLs() {
138 |             return this.badURLStrings;
139 |         }
140 |     }
141 | 
142 |     public static String getCrawlID(final MultiProtocolURL url, final Date date, final int count) {
143 |         String id = url.getHost();
144 |         if (id.length() > 80) id = id.substring(0, 80) + "-" + id.hashCode();
145 |         id = id + "-" + DateParser.secondDateFormat.format(date).replace(':', '-').replace(' ', '-') + "-" + count;
146 |         return id;
147 |     }
148 | 
149 |     public CrawlerListener(final Configuration config, final YaCyServices service) {
150 |         super(config, service, Runtime.getRuntime().availableProcessors());
151 | 
152 |         this.blacklist_crawler_names_list = config.properties.get("grid.crawler.blacklist").split(",");
153 |         this.blacklist_indexer_names_list = config.properties.get("grid.indexer.blacklist").split(",");
154 |         this.blacklists_crawler = new ConcurrentHashMap<>();
155 |         this.blacklists_indexer = new ConcurrentHashMap<>();
156 |     }
157 | 
158 |     private final Blacklist getBlacklistCrawler(final String processName, final int processNumber) {
159 |         final String key = processName + "_" + processNumber;
160 |         Blacklist blacklist = this.blacklists_crawler.get(key);
161 |         if (blacklist == null) {
162 |             this.blacklists_crawler.put(key, blacklist = loadBlacklist(this.blacklist_crawler_names_list));
163 |         }
164 |         return blacklist;
165 |     }
166 | 
167 |     private final Blacklist getBlacklistIndexer(final String processName, final int processNumber) {
168 |         final String key = processName + "_" + processNumber;
169 |         Blacklist blacklist = this.blacklists_indexer.get(key);
170 |         if (blacklist == null) {
171 |             this.blacklists_indexer.put(key, blacklist = loadBlacklist(this.blacklist_indexer_names_list));
172 |         }
173 |         return blacklist;
174 |     }
175 | 
176 |     private final Blacklist loadBlacklist(final String[] names) {
177 |         final Blacklist blacklist = new Blacklist();
178 |         for (final String name: names) {
179 |             File f = new File(super.config.gridServicePath, "conf/" + name.trim());
180 |             if (!f.exists()) f = new File("conf/" + name.trim());
181 |             if (!f.exists()) continue;
182 |             try {
183 |                 blacklist.load(f);
184 |             } catch (final IOException e) {
185 |                 Logger.warn(this.getClass(), e);
186 |             }
187 |         }
188 |         return blacklist;
189 |     }
190 | 
191 |     @Override
192 |     public ActionResult processAction(final SusiAction crawlaction, final JSONArray data, final String processName, final int processNumber) {
193 |         doDoubleCleanup();
194 |         final String crawl_id = crawlaction.getStringAttr("id");
195 |         String user_id = crawlaction.getStringAttr("user_id");
196 |         if (user_id == null || user_id.length() == 0) user_id = User.ANONYMOUS_ID;
197 |         JSONArray user_ids = crawlaction.getArrayAttr("user_ids");
198 |         if (user_ids == null) user_ids = new JSONArray();
199 |         if (user_id != null && user_id.length() > 0 && !user_ids.toList().contains(user_id)) user_ids.put(user_id);
200 | 
201 |         if (crawl_id == null || crawl_id.length() == 0) {
202 |             Logger.info("Crawler.processAction Fail: Action does not have an id: " + crawlaction.toString());
203 |             return ActionResult.FAIL_IRREVERSIBLE;
204 |         }
205 |         final JSONObject crawl = SusiThought.selectData(data, "id", crawl_id);
206 |         if (crawl == null) {
207 |             Logger.info(this.getClass(), "Crawler.processAction Fail: ID of Action not found in data: " + crawlaction.toString());
208 |             return ActionResult.FAIL_IRREVERSIBLE;
209 |         }
210 | 
211 |         final boolean archiveWARC = crawl.optBoolean("archiveWARC");
212 |         final boolean archiveIndex = crawl.optBoolean("archiveIndex");
213 |         final boolean archiveGraph = crawl.optBoolean("archiveGraph");
214 | 
215 |         final int depth = crawlaction.getIntAttr("depth");
216 |         final int crawlingDepth = crawl.getInt("crawlingDepth");
217 |         final int priority =  crawl.has("priority") ? crawl.getInt("priority") : 0;
218 |         // check depth (this check should be deprecated because we limit by omitting the crawl message at crawl tree leaves)
219 |         if (depth > crawlingDepth) {
220 |             // this is a leaf in the crawl tree (it does not mean that the crawl is finished)
221 |             Logger.info(this.getClass(), "Crawler.processAction Leaf: reached a crawl leaf for crawl " + crawl_id + ", depth = " + crawlingDepth);
222 |             return ActionResult.SUCCESS;
223 |         }
224 |         final boolean isCrawlLeaf = depth == crawlingDepth;
225 | 
226 |         // load graph
227 |         final String sourcegraph = crawlaction.getStringAttr("sourcegraph");
228 |         if (sourcegraph == null || sourcegraph.length() == 0) {
229 |             Logger.info(this.getClass(), "Crawler.processAction Fail: sourcegraph of Action is empty: " + crawlaction.toString());
230 |             return ActionResult.FAIL_IRREVERSIBLE;
231 |         }
232 |         try {
233 |             JSONList jsonlist = null;
234 |             if (crawlaction.hasAsset(sourcegraph)) {
235 |                 jsonlist = crawlaction.getJSONListAsset(sourcegraph);
236 |             }
237 |             if (jsonlist == null) try {
238 |                 final Asset<byte[]> graphasset = super.config.gridStorage.load(sourcegraph); // this must be a list of json, containing document links
239 |                 final byte[] graphassetbytes = graphasset.getPayload();
240 |                 jsonlist = new JSONList(new ByteArrayInputStream(graphassetbytes));
241 |             } catch (final IOException e) {
242 |                 Logger.warn(this.getClass(), "Crawler.processAction could not read asset from storage: " + sourcegraph, e);
243 |                 return ActionResult.FAIL_IRREVERSIBLE;
244 |             }
245 | 
246 |             // declare filter from the crawl profile
247 |             final String mustmatchs = crawl.optString("mustmatch");
248 |             final Pattern mustmatch = Pattern.compile(mustmatchs);
249 |             final String mustnotmatchs = crawl.optString("mustnotmatch");
250 |             final Pattern mustnotmatch = Pattern.compile(mustnotmatchs);
251 |             // filter for indexing steering
252 |             final String indexmustmatchs = crawl.optString("indexmustmatch");
253 |             final Pattern indexmustmatch = Pattern.compile(indexmustmatchs);
254 |             final String indexmustnotmatchs = crawl.optString("indexmustnotmatch");
255 |             final Pattern indexmustnotmatch = Pattern.compile(indexmustnotmatchs);
256 |             // attributes for new crawl entries
257 |             final String collectionss = crawl.optString("collection");
258 |             final Map<String, Pattern> collections = WebMapping.collectionParser(collectionss);
259 |             final String start_url = crawl.optString("start_url");
260 |             final String start_ssld = crawl.optString("start_ssld");
261 | 
262 |             final Date now = new Date();
263 |             final long timestamp = now.getTime();
264 |             // For each of the parsed document, there is a target graph.
265 |             // The graph contains all url elements which may appear in a document.
266 |             // In the following loop we collect all urls which may be of interest for the next depth of the crawl.
267 |             final Map<String, String> nextMap = new HashMap<>(); // a map from urlid to url
268 |             final Blacklist blacklist_crawler = getBlacklistCrawler(processName, processNumber);
269 |             final List<CrawlerDocument> crawlerDocuments = new ArrayList<>();
270 |             graphloop: for (int line = 0; line < jsonlist.length(); line++) {
271 |                 final JSONObject json = jsonlist.get(line);
272 |                 if (json.has("index")) continue graphloop; // this is an elasticsearch index directive, we just skip that
273 | 
274 |                 final String sourceurl = json.has(WebMapping.url_s.getMapping().name()) ? json.getString(WebMapping.url_s.getMapping().name()) : "";
275 |                 final Set<MultiProtocolURL> graph = new HashSet<>();
276 |                 final String graphurl = json.has(WebMapping.canonical_s.name()) ? json.getString(WebMapping.canonical_s.name()) : null;
277 |                 if (graphurl != null) try {
278 |                     graph.add(new MultiProtocolURL(graphurl));
279 |                 } catch (final MalformedURLException e) {
280 |                     Logger.warn(this.getClass(), "Crawler.processAction error when starting crawl with canonical url " + graphurl, e);
281 |                 }
282 |                 for (final String field: FIELDS_IN_GRAPH) {
283 |                     if (json.has(field)) {
284 |                         final JSONArray a = json.getJSONArray(field);
285 |                         urlloop: for (int i = 0; i < a.length(); i++) {
286 |                             final String u = a.getString(i);
287 |                             try {
288 |                                 graph.add(new MultiProtocolURL(u));
289 |                             } catch (final MalformedURLException e) {
290 |                                 Logger.warn(this.getClass(), "Crawler.processAction we discovered a bad follow-up url: " + u, e);
291 |                                 continue urlloop;
292 |                             }
293 |                         }
294 |                     }
295 |                 }
296 | 
297 |                 // sort out doubles and apply filters
298 |                 DoubleCache doublecache = null;
299 |                 if (!this.doubles.containsKey(crawl_id)) this.doubles.put(crawl_id, new DoubleCache());
300 |                 doublecache = this.doubles.get(crawl_id);
301 |                 Logger.info(this.getClass(), "Crawler.processAction processing sub-graph with " + graph.size() + " urls for url " + sourceurl);
302 |                 urlcheck: for (final MultiProtocolURL url: graph) {
303 |                     // prepare status document
304 |                     final ContentDomain cd = url.getContentDomainFromExt();
305 | 
306 |                     if (cd == ContentDomain.TEXT || cd == ContentDomain.ALL) {
307 |                         // check if the url shall be loaded using the constraints
308 |                         final String u = url.toNormalform(true);
309 |                         final String urlid = Digest.encodeMD5Hex(u);
310 | 
311 |                         // double check with the fast double cache
312 |                         if (doublecache.doubleHashes.contains(urlid)) {
313 |                             continue urlcheck;
314 |                         }
315 |                         doublecache.doubleHashes.add(urlid);
316 | 
317 |                         // create new crawl status document
318 |                         final CrawlerDocument crawlStatus = new CrawlerDocument()
319 |                                 .setCrawlID(crawl_id)
320 |                                 .setUserlID(user_id)
321 |                                 .setMustmatch(mustmatchs)
322 |                                 .setCollections(collections.keySet())
323 |                                 .setCrawlstartURL(start_url)
324 |                                 .setCrawlstartSSLD(start_ssld)
325 |                                 .setInitDate(now)
326 |                                 .setStatusDate(now)
327 |                                 .setURL(u);
328 | 
329 |                         // check matcher rules
330 |                         if (!mustmatch.matcher(u).matches() || mustnotmatch.matcher(u).matches()) {
331 |                             crawlStatus
332 |                             .setStatus(Status.rejected)
333 |                             .setComment(!mustmatch.matcher(u).matches() ? "url does not match must-match filter " + mustmatchs : "url matches mustnotmatch filter " + mustnotmatchs);
334 |                             crawlerDocuments.add(crawlStatus);
335 |                             continue urlcheck;
336 |                         }
337 | 
338 |                         // check blacklist (this is costly because the blacklist is huge)
339 |                         final Blacklist.BlacklistInfo blacklistInfo = blacklist_crawler.isBlacklisted(u, url);
340 |                         if (blacklistInfo != null) {
341 |                             Logger.info(this.getClass(), "Crawler.processAction crawler blacklist pattern '" + blacklistInfo.matcher.pattern().toString() + "' removed url '" + u + "' from crawl list " + blacklistInfo.source + ":  " + blacklistInfo.info);
342 |                             crawlStatus
343 |                             .setStatus(Status.rejected)
344 |                             .setComment("url matches blacklist");
345 |                             crawlerDocuments.add(crawlStatus);
346 |                             continue urlcheck;
347 |                         }
348 | 
349 |                         // double check with the elastic index (we do this late here because it is the most costly operation)
350 |                         //if (config.gridIndex.exist(GridIndex.CRAWLER_INDEX_NAME, GridIndex.EVENT_TYPE_NAME, urlid)) {
351 |                         //    continue urlcheck;
352 |                         //}
353 | 
354 |                         // add url to next stack
355 |                         nextMap.put(urlid, u);
356 |                     }
357 |                 };
358 |             }
359 | 
360 |             if (!nextMap.isEmpty()) {
361 | 
362 |                 // make a double-check
363 |                 final String crawlerIndexName = super.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER);
364 |                 final Set<String> exist = super.config.gridIndex.existBulk(crawlerIndexName, nextMap.keySet());
365 |                 for (final String u: exist) nextMap.remove(u);
366 |                 final Collection<String> nextList = nextMap.values(); // a set of urls
367 | 
368 |                 // divide the nextList into two sub-lists, one which will reach the indexer and another one which will not cause indexing
369 |                 @SuppressWarnings("unchecked")
370 |                 final
371 |                 List<String>[] indexNoIndex = new List[2];
372 |                 indexNoIndex[0] = new ArrayList<>(); // for: index
373 |                 indexNoIndex[1] = new ArrayList<>(); // for: no-Index
374 |                 final Blacklist blacklist_indexer = getBlacklistIndexer(processName, processNumber);
375 |                 nextList.forEach(url -> {
376 |                     final boolean indexConstratntFromCrawlProfil = indexmustmatch.matcher(url).matches() && !indexmustnotmatch.matcher(url).matches();
377 |                     final Blacklist.BlacklistInfo blacklistInfo = blacklist_indexer.isBlacklisted(url, null);
378 |                     final boolean indexConstraintFromBlacklist = blacklistInfo == null;
379 |                     if (indexConstratntFromCrawlProfil && indexConstraintFromBlacklist) {
380 |                         indexNoIndex[0].add(url);
381 |                     } else {
382 |                         indexNoIndex[1].add(url);
383 |                     }
384 |                 });
385 | 
386 |                 for (int ini = 0; ini < 2; ini++) {
387 | 
388 |                     // create crawler index entries
389 |                     for (final String u: indexNoIndex[ini]) {
390 |                         final CrawlerDocument crawlStatus = new CrawlerDocument()
391 |                                 .setCrawlID(crawl_id)
392 |                                 .setUserlID(user_id)
393 |                                 .setMustmatch(mustmatchs)
394 |                                 .setCollections(collections.keySet())
395 |                                 .setCrawlstartURL(start_url)
396 |                                 .setCrawlstartSSLD(start_ssld)
397 |                                 .setInitDate(now)
398 |                                 .setStatusDate(now)
399 |                                 .setStatus(Status.accepted)
400 |                                 .setURL(u)
401 |                                 .setComment(ini == 0 ? "to be indexed" : "noindex, just for crawling");
402 |                         crawlerDocuments.add(crawlStatus);
403 |                     }
404 | 
405 |                     // create partitions
406 |                     final List<JSONArray> partitions = createPartition(indexNoIndex[ini], 8);
407 | 
408 |                     // create follow-up crawl to next depth
409 |                     for (int pc = 0; pc < partitions.size(); pc++) {
410 |                         final JSONObject loaderAction = newLoaderAction(
411 |                                 priority, crawl_id, user_id, user_ids, partitions.get(pc), depth, isCrawlLeaf,
412 |                                 0, timestamp + ini, pc, depth < crawlingDepth, ini == 0,
413 |                                 archiveWARC, archiveIndex, archiveGraph); // action includes whole hierarchy of follow-up actions
414 |                         final SusiThought nextjson = new SusiThought()
415 |                                 .setData(data)
416 |                                 .addAction(new SusiAction(loaderAction));
417 | 
418 |                         // put a loader message on the queue
419 |                         final String message = nextjson.toString(2);
420 |                         final byte[] b = message.getBytes(StandardCharsets.UTF_8);
421 |                         try {
422 |                             final Services serviceName = YaCyServices.valueOf(loaderAction.getString("type"));
423 |                             final GridQueue queueName = new GridQueue(loaderAction.getString("queue"));
424 |                             super.config.gridBroker.send(serviceName, queueName, b);
425 |                         } catch (final IOException e) {
426 |                             Logger.warn(this.getClass(), "error when starting crawl with message " + message, e);
427 |                         }
428 |                     };
429 |                 }
430 |             }
431 |             // bulk-store the crawler documents
432 |             final Map<String, CrawlerDocument> crawlerDocumentsMap = new HashMap<>();
433 |             crawlerDocuments.forEach(crawlerDocument -> {
434 |                 final String url = crawlerDocument.getURL();
435 |                 if (url != null && url.length() > 0) {
436 |                     final String id = Digest.encodeMD5Hex(url);
437 |                     crawlerDocumentsMap.put(id, crawlerDocument);
438 |                 } else {
439 |                     assert false : "url not set / storeBulk";
440 |                 }
441 |             });
442 |             CrawlerDocument.storeBulk(super.config, super.config.gridIndex, crawlerDocumentsMap);
443 |             Logger.info(this.getClass(), "Crawler.processAction processed graph with " +  jsonlist.length()/2 + " subgraphs from " + sourcegraph);
444 |             return ActionResult.SUCCESS;
445 |         } catch (final Throwable e) {
446 |             Logger.warn(this.getClass(), "Crawler.processAction Fail: loading of sourcegraph failed: " + e.getMessage() /*+ "\n" + crawlaction.toString()*/, e);
447 |             return ActionResult.FAIL_IRREVERSIBLE;
448 |         }
449 |     }
450 | 
451 |     private static List<JSONArray> createPartition(final Collection<String> urls, final int partitionSize) {
452 |         final List<JSONArray> partitions = new ArrayList<>();
453 |         urls.forEach(url -> {
454 |             int c = partitions.size();
455 |             if (c == 0 || partitions.get(c - 1).length() >= partitionSize) {
456 |                 partitions.add(new JSONArray());
457 |                 c++;
458 |             }
459 |             partitions.get(c - 1).put(url);
460 |         });
461 |         return partitions;
462 |     }
463 | 
464 |     /**
465 |      * Create a new loader action. This action contains all follow-up actions after
466 |      * loading to create a steering of parser, indexing and follow-up crawler actions.
467 |      * @param priority the prioroty of the crawl
468 |      * @param id the crawl id
469 |      * @param user_id the id of the user (9 digit number)
470 |      * @param user_ids all users which have that domin as crawl assigned
471 |      * @param urls the urls which are part of the same actions
472 |      * @param depth the depth of the crawl step (0 is start depth)
473 |      * @param retry the number of load re-tries (0 is no retry, shows that this is the first attempt)
474 |      * @param timestamp the current time when the crawler created the action
475 |      * @param partition unique number of the url set partition. This is used to create asset names.
476 |      * @param doCrawling flag: if true, create a follow-up crawling action. set this to false to terminate crawling afterwards
477 |      * @param doIndexing flag: if true, do an indexing after loading. set this to false if the purpose is only a follow-up crawl after parsing
478 |      * @return the action json
479 |      * @throws IOException
480 |      */
481 |     private JSONObject newLoaderAction(
482 |             final int priority,
483 |             final String id,
484 |             final String user_id,
485 |             final JSONArray user_ids,
486 |             final JSONArray urls,
487 |             final int depth,
488 |             final boolean isCrawlLeaf,
489 |             final int retry,
490 |             final long timestamp,
491 |             final int partition,
492 |             final boolean doCrawling,
493 |             final boolean doIndexing,
494 |             final boolean archiveWARC,
495 |             final boolean archiveIndex,
496 |             final boolean archiveGraph) throws IOException {
497 |         // create file names for the assets: this uses depth and partition information
498 |         final SimpleDateFormat FORMAT_TIMEF = new SimpleDateFormat(PATTERN_TIMEF, Locale.US); // we must create this here to prevent concurrency bugs which are there in the date formatter :((
499 |         final String basepath  = "/data/aaaaa/accounting/" + user_id + "/";
500 |         final String docname  = "d" + intf(depth, 2) + "-t" + FORMAT_TIMEF.format(new Date(timestamp)) + "-p" + intf(partition, 4);
501 |         final String warcasset  =  basepath + "warc/"  + id + "/" + docname + ".warc.gz";
502 |         final String indexasset =  basepath + "index/" + id + "/" + docname + ".index.jsonlist";
503 |         final String graphasset =  basepath + "graph/" + id + "/" + docname + ".graph.jsonlist";
504 |         final String hashKey = new MultiProtocolURL(urls.getString(0)).getHost();
505 | 
506 |         // create actions to be done in reverse order:
507 |         // at the end of the processing we simultaneously place actions on the indexing and crawling queue
508 |         final JSONArray postParserActions = new JSONArray();
509 |         assert doIndexing || doCrawling; // one or both must be true; doing none of that does not make sense
510 |         // if all of the urls shall be indexed (see indexing patterns) then do indexing actions
511 |         if (doIndexing) {
512 |             final GridQueue indexerQueueName = super.config.gridBroker.queueName(YaCyServices.indexer, YaCyServices.indexer.getSourceQueues(), ShardingMethod.LEAST_FILLED, INDEXER_PRIORITY_DIMENSIONS, priority, hashKey);
513 |             postParserActions.put(new JSONObject(true)
514 |                     .put("type", YaCyServices.indexer.name())
515 |                     .put("queue", indexerQueueName.name())
516 |                     .put("id", id)
517 |                     .put("user_id", user_id)
518 |                     .put("user_ids", user_ids)
519 |                     .put("sourceasset", indexasset)
520 |                     .put("archiveindex", archiveIndex)
521 |                     );
522 |         }
523 |         // if all of the urls shall be crawled at depth + 1, add a crawling action. Don't do this only if the crawling depth is at the depth limit.
524 |         if (doCrawling) {
525 |             final GridQueue crawlerQueueName = super.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.LEAST_FILLED, CRAWLER_PRIORITY_DIMENSIONS, priority, hashKey);
526 |             postParserActions.put(new JSONObject(true)
527 |                     .put("type", YaCyServices.crawler.name())
528 |                     .put("queue", crawlerQueueName.name())
529 |                     .put("id", id)
530 |                     .put("user_id", user_id)
531 |                     .put("user_ids", user_ids)
532 |                     .put("depth", depth + 1)
533 |                     .put("sourcegraph", graphasset)
534 |                     .put("archivegraph", archiveGraph)
535 |                     );
536 |         }
537 | 
538 |         // before that and after loading we have a parsing action
539 |         final GridQueue parserQueueName = super.config.gridBroker.queueName(YaCyServices.parser, YaCyServices.parser.getSourceQueues(), ShardingMethod.LEAST_FILLED, PARSER_PRIORITY_DIMENSIONS, priority, hashKey);
540 |         final JSONArray parserActions = new JSONArray().put(new JSONObject(true)
541 |                 .put("type", YaCyServices.parser.name())
542 |                 .put("queue", parserQueueName.name())
543 |                 .put("id", id)
544 |                 .put("user_id", user_id)
545 |                 .put("user_ids", user_ids)
546 |                 .put("sourceasset", warcasset)
547 |                 .put("targetasset", indexasset)
548 |                 .put("targetgraph", graphasset)
549 |                 .put("archivewarc", archiveWARC)
550 |                 .put("archiveindex", archiveIndex)
551 |                 .put("archivegraph", archiveGraph)
552 |                 .put("actions", postParserActions)); // actions after parsing
553 | 
554 |         // at the beginning of the process, we do a loading.
555 |         final GridQueue loaderQueueName = super.config.gridBroker.queueName(YaCyServices.loader, YaCyServices.loader.getSourceQueues(), isCrawlLeaf ? ShardingMethod.LEAST_FILLED : ShardingMethod.BALANCE, LOADER_PRIORITY_DIMENSIONS, priority, hashKey);
556 |         final JSONObject loaderAction = new JSONObject(true)
557 |                 .put("type", YaCyServices.loader.name())
558 |                 .put("queue", loaderQueueName.name())
559 |                 .put("id", id)
560 |                 .put("user_id", user_id)
561 |                 .put("user_ids", user_ids)
562 |                 .put("urls", urls)
563 |                 .put("targetasset", warcasset)
564 |                 .put("archivewarc", archiveWARC)
565 |                 .put("actions", parserActions); // actions after loading
566 |         return loaderAction;
567 |     }
568 | 
569 |     private final static String intf(final int i, final int len) {
570 |         String s = Integer.toString(i);
571 |         while (s.length() < len) s = '0' + s;
572 |         return s;
573 |     }
574 | 
575 |     @Override
576 |     public Telemetry getTelemetry() {
577 |         return null;
578 |     }
579 | }
580 | 


--------------------------------------------------------------------------------