├── .github
│   └── FUNDING.yml
├── bin
│   ├── restart.sh
│   ├── start.sh
│   ├── stop.sh
│   ├── crawlstart.py
│   └── start_crawler_docker.sh
├── gradle
│   └── wrapper
│       ├── gradle-wrapper.jar
│       └── gradle-wrapper.properties
├── .dockerignore
├── .gitmodules
├── .gitignore
├── .settings
│   ├── org.eclipse.jdt.core.prefs
│   └── org.eclipse.buildship.core.prefs
├── conf
│   ├── indexer_blacklist_filetypes.txt
│   ├── crawler_blacklist_localhost.txt
│   └── config.properties
├── src
│   └── main
│       ├── resources
│       │   └── log4j.properties
│       └── java
│           └── net
│               └── yacy
│                   └── grid
│                       └── crawler
│                           ├── api
│                           │   ├── CrawlerDefaultValuesService.java
│                           │   └── CrawlStartService.java
│                           ├── Blacklist.java
│                           ├── Crawler.java
│                           └── CrawlerListener.java
├── .project
├── Dockerfile
├── .classpath
├── gradlew.bat
├── README.md
└── gradlew
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: orbiter
2 | patreon: 0rb1t3r
3 |
--------------------------------------------------------------------------------
/bin/restart.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | cd "`dirname $0`"
3 | ./stop.sh
4 | sleep 1
5 | ./start.sh
6 |
7 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yacy/yacy_grid_crawler/HEAD/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .gitignore
3 | data
4 | build
5 | bin
6 | docker
7 | Dockerfile
8 | LICENSE.md
9 | README.md
10 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "submodules/yacy_grid_mcp"]
2 | path = submodules/yacy_grid_mcp
3 | url = https://github.com/yacy/yacy_grid_mcp.git
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | classes/
2 | target/
3 | data/
4 | /class/
5 | /.gradle/
6 | /build/
7 | .DS_Store
8 | .settings
9 | .idea/
10 | bin/ai/
11 | bin/log4j.properties
12 | bin/net/
13 | bin/org/
--------------------------------------------------------------------------------
/bin/start.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | cd "`dirname $0`"
3 | cd ..
4 | nohup java -jar build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar < /dev/null &
5 | sleep 1
6 | echo "YaCy Grid Crawler started!"
7 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
3 | org.eclipse.jdt.core.compiler.compliance=1.8
4 | org.eclipse.jdt.core.compiler.source=1.8
5 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/conf/indexer_blacklist_filetypes.txt:
--------------------------------------------------------------------------------
1 | # Indexing Blacklist for bad file types
2 |
3 | .*?\.xml # Reject XML in search index
4 | .*?\.css # Reject CSS in search index
5 | .*?\.js # Reject JavaScript in search index
6 | .*?/robots\.txt # Reject robots.txt
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set root logger level to DEBUG and its only appender to A1.
2 | log4j.rootLogger=DEBUG, A1
3 |
4 | # A1 is set to be a ConsoleAppender.
5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
6 |
7 | # A1 uses PatternLayout.
8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
10 |
11 | log4j.logger.org.eclipse.jetty = INFO
12 | log4j.logger.org.apache.http = INFO
13 |
--------------------------------------------------------------------------------
/conf/crawler_blacklist_localhost.txt:
--------------------------------------------------------------------------------
1 | # Blacklist for local, private or intranet URLs
2 |
3 | .*?//localhost.*+ # Localhost host name
4 | .*?//127\..*+ # Localhost IPv4
5 | .*?//10\..*+ # Private IPv4 Class A Network 10.x.x.x
6 | .*?//172\.(1[6-9]|2[0-9]|3[0-1])\..*+ # Private IPv4 Class B Network 172.16.0.0 .. 172.31.255.255
7 | .*?//192\.168\..*+ # Private IPv4 Class C Network 192.168.0.0 .. 192.168.255.255
8 | .*?//\[?::1.*+ # Localhost IPv6
9 | .*?//\[?[fF][cCdD].*+ # IPv6 Unique Local Address space
10 |
--------------------------------------------------------------------------------
/bin/stop.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | cd "`dirname $0`"
3 | cd ../data
4 | KILLFILE="crawler-8300.kill"
5 | PIDFILE="crawler-8300.pid"
6 |
7 | # first method to terminate the process
8 | if [ -f "$KILLFILE" ];
9 | then
10 | rm $KILLFILE
11 | echo "termination requested, waiting.."
12 | # this can take 10 seconds..
13 | sleep 10
14 | fi
15 |
16 | # second method to terminate the process
17 | if [ -f "$PIDFILE" ];
18 | then
19 | fuser -k $PIDFILE
20 | fi
21 |
22 | # check if the pid file no longer exists, which indicates that the process has terminated
23 | if [ ! -f "$PIDFILE" ];
24 | then
25 | echo "process terminated"
26 | fi
27 |
28 |
--------------------------------------------------------------------------------
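The script above relies on a service-side counterpart: the running crawler is expected to create `data/crawler-8300.pid` and a `data/crawler-8300.kill` file at startup and to shut down once the kill file disappears. That counterpart lives in the (not shown) yacy_grid_mcp submodule; the following is only a minimal sketch of the assumed protocol, with hypothetical class and file handling, not the actual implementation.

```
import java.io.File;
import java.lang.management.ManagementFactory;
import java.nio.file.Files;

// Sketch of the shutdown protocol that bin/stop.sh relies on (assumption:
// the real watcher is implemented in the yacy_grid_mcp submodule).
public class KillfileWatcherSketch {

    public static void main(String[] args) throws Exception {
        final File data = new File("data");
        data.mkdirs();
        final File killfile = new File(data, "crawler-8300.kill");
        final File pidfile  = new File(data, "crawler-8300.pid");

        // Announce the process. Note: stop.sh uses `fuser -k` on the pid file,
        // so the real service presumably keeps this file open; writing the pid
        // (taken from the conventional "pid@host" runtime name) is illustrative.
        final String pid = ManagementFactory.getRuntimeMXBean().getName().split("@")[0];
        Files.write(pidfile.toPath(), pid.getBytes());
        killfile.createNewFile();

        // stop.sh requests termination by deleting the kill file;
        // the service notices that within one polling cycle.
        while (killfile.exists()) {
            Thread.sleep(1000);
        }

        // Clean shutdown: remove the pid file so stop.sh can report "process terminated".
        Files.deleteIfExists(pidfile.toPath());
        System.out.println("termination requested via kill file, shutting down");
    }
}
```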
/.settings/org.eclipse.buildship.core.prefs:
--------------------------------------------------------------------------------
1 | arguments=
2 | auto.sync=false
3 | build.commands=org.eclipse.jdt.core.javabuilder
4 | build.scans.enabled=false
5 | connection.arguments=
6 | connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(5.6.2))
7 | connection.java.home=null
8 | connection.jvm.arguments=
9 | connection.project.dir=
10 | derived.resources=.gradle,build
11 | eclipse.preferences.version=1
12 | gradle.user.home=
13 | java.home=
14 | jvm.arguments=
15 | natures=org.eclipse.jdt.core.javanature
16 | offline.mode=false
17 | override.workspace.settings=true
18 | project.path=\:
19 | show.console.view=true
20 | show.executions.view=true
21 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>yacy_grid_crawler</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.buildship.core.gradleprojectbuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>org.eclipse.buildship.core.gradleprojectnature</nature>
21 | 		<nature>org.eclipse.jdt.core.javanature</nature>
22 | 	</natures>
23 | </projectDescription>
24 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ## yacy_grid_crawler dockerfile
2 | ## examples:
3 | # docker build -t yacy_grid_crawler .
4 | # docker run -d --rm -p 8300:8300 --name yacy_grid_crawler yacy_grid_crawler
5 | ## Check if the service is running:
6 | # curl http://localhost:8300/yacy/grid/mcp/info/status.json
7 |
8 | # build app
9 | FROM eclipse-temurin:8-jdk-focal AS appbuilder
10 | COPY ./ /app
11 | WORKDIR /app
12 | RUN ./gradlew clean shadowDistTar
13 |
14 | # build dist
15 | FROM eclipse-temurin:8-jre-focal
16 | LABEL maintainer="Michael Peter Christen "
17 | ENV DEBIAN_FRONTEND noninteractive
18 | ARG default_branch=master
19 | COPY ./conf /app/conf/
20 | COPY --from=appbuilder /app/build/libs/ ./app/build/libs/
21 | WORKDIR /app
22 | EXPOSE 8300
23 |
24 | # for some weird reason the jar file is sometimes not named correctly
25 | RUN if [ -e /app/build/libs/app-0.0.1-SNAPSHOT-all.jar ] ; then mv /app/build/libs/app-0.0.1-SNAPSHOT-all.jar /app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar; fi
26 |
27 | CMD ["java", "-Xms320M", "-Xmx2G", "-jar", "/app/build/libs/yacy_grid_crawler-0.0.1-SNAPSHOT-all.jar"]
28 |
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/bin/crawlstart.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | import sys
4 | import requests
5 | import urllib.parse
6 |
7 | crawlingURL = sys.argv[1]
8 | parsed_url = urllib.parse.urlparse(crawlingURL)
9 | crawlingHost = parsed_url.netloc
10 | crawlingProtocol = parsed_url.scheme
11 |
12 | data = {
13 | 'cachePolicy': 'iffresh',
14 | 'collection': 'testcollection',
15 | 'crawlingstart': 'Start crawling',
16 | 'crawlingMode': 'url',
17 | 'crawlingQ': 'on',
18 | 'crawlingDepth': 1,
19 | 'crawlingDepthExtension': '',
20 | 'crawlingURL': crawlingURL,
21 | 'deleteIfOlderNumber': 1,
22 | 'deleteIfOlderUnit': 'day',
23 | 'deleteold': 'age',
24 | 'indexmustmatch': '^{0}.*'.format(crawlingURL),
25 | 'indexmustnotmatch': '',
26 | 'indexMedia': 'on',
27 | 'mustmatch': '^{protocol}://{host}/.*'.format(protocol=crawlingProtocol, host=crawlingHost),
28 | 'mustnotmatch': '',
29 | 'indexText': 'on',
30 | 'range': 'wide',
31 | 'recrawl': 'reload',
32 | 'reloadIfOlderNumber': 0,
33 | 'reloadIfOlderUnit': 'day',
34 | 'storeHTCache': 'on',
35 | 'xsstopw': 'on',
36 | 'priority': 0
37 | }
38 |
39 | res = requests.get('http://localhost:8300/yacy/grid/crawler/crawlStart.json', params=data)
40 |
41 | if res.status_code != 200:
42 | print("ERR :: error starting the crawler")
43 | print(res.text)
44 | else:
45 | print("INF :: successfully sent '{0}' to crawler".format(crawlingURL))
46 |
--------------------------------------------------------------------------------
/bin/start_crawler_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd "`dirname $0`"
3 |
4 | bindhost="127.0.0.1"
5 | callhost="localhost"
6 | appname="YaCy Grid Crawler"
7 | containername=yacy-grid-crawler
8 | imagename=${containername//-/_}
9 | dockerfile="Dockerfile"
10 | production=false
11 | open=false
12 |
13 | usage() { echo "usage: $0 [-o | --open | -p | --production | --arm32 | --arm64 ]" 1>&2; exit 1; }
14 |
15 | while [[ $# -gt 0 ]]; do
16 | case "$1" in
17 | -p | --production ) production=true; shift 1;;
18 | -o | --open ) open=true; shift 1;;
19 | --arm32 ) imagename=${imagename}:arm32; dockerfile=${dockerfile}_arm32; shift 1;;
20 | --arm64 ) imagename=${imagename}:arm64; dockerfile=${dockerfile}_arm64; shift 1;;
21 | -h | --help | -* | --* | * ) usage;;
22 | esac
23 | done
24 | if [ "$production" = true ] ; then imagename="yacy/${imagename}"; fi
25 | if [ "$open" = true ] ; then bindhost="0.0.0.0"; callhost=`hostname`; fi
26 |
27 | containerRuns=$(docker ps | grep -i "${containername}" | wc -l )
28 | containerExists=$(docker ps -a | grep -i "${containername}" | wc -l )
29 | if [ ${containerRuns} -gt 0 ]; then
30 | echo "${appname} container is already running"
31 | elif [ ${containerExists} -gt 0 ]; then
32 | docker start ${containername}
33 | echo "${appname} container re-started"
34 | else
35 | if [[ $imagename != "yacy/"*":latest" ]] && [[ "$(docker images -q ${imagename} 2> /dev/null)" == "" ]]; then
36 | cd ..
37 | docker build -t ${imagename} -f ${dockerfile} .
38 | cd bin
39 | fi
40 | docker run -d --restart=unless-stopped -p ${bindhost}:8300:8300 \
41 | --link yacy-grid-minio --link yacy-grid-rabbitmq --link yacy-grid-elasticsearch --link yacy-grid-mcp \
42 | -e YACYGRID_GRID_MCP_ADDRESS=yacy-grid-mcp \
43 | --name ${containername} ${imagename}
44 | echo "${appname} started."
45 | fi
46 | docker ps -a --format "table {{.ID}}\t{{.Image}}\t{{.Names}}\t{{.Mounts}}\t{{.Ports}}"
47 |
48 | echo "To get the app status, open http://${callhost}:8300/yacy/grid/mcp/info/status.json"
49 |
--------------------------------------------------------------------------------
/conf/config.properties:
--------------------------------------------------------------------------------
1 | port = 8300
2 | grid.mcp.address = 127.0.0.1:8100,node00.local:8100,brain.local:8100,searchlab.eu:8100
3 | grid.broker.lazy = true
4 | grid.broker.queue.limit = 0
5 | grid.broker.queue.throttling = 100000
6 | grid.assets.delete = true
7 |
8 | # The blacklist is chosen with the attribute grid.crawler.blacklist, which gives the file name(s) of the blacklist(s) to be used.
9 | # To use your own blacklist, create a file in data/crawler-8300/conf/ and set its name
10 | # in the attribute grid.crawler.blacklist.
11 | #
12 | # You can use several blacklists simultaneously; just comma-separate the file names.
13 | # All files in the paths conf/ and data/crawler-8300/conf/ are found.
14 | # The same applies to files of parallel processes, like data/crawler-8301/conf/ and so on.
15 | #
16 | # The file format of the blacklist is:
17 | # - it is a plain text file in UTF-8 encoding
18 | # - every line beginning with '#' is a comment and is ignored
19 | # - every string matching ' #.*' is removed. This cuts away comments at the end of a line.
20 | # - every blank line is ignored
21 | # - every other line must contain a regular expression according to
22 | #   https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
23 | #   which is treated as a matcher pattern (not a find pattern) for a URL.
24 | #   Lines containing a regular expression get their leading and trailing spaces removed.
25 | #
26 | # All regular expressions are combined as a disjunction (OR logic) for the filtering of crawling URLs.
27 | # URLs are normalized before a match is attempted, which means they are encoded properly
28 | # and the fragment identifier is removed from the end of the URL.
29 | grid.crawler.blacklist = crawler_blacklist_someonewhocares.txt,crawler_blacklist_localhost.txt
30 | grid.indexer.blacklist = indexer_blacklist_filetypes.txt
31 | grid.indexer.priorityQueues = 2
32 |
33 |
34 |
35 | ####################################################################
36 | ## The following properties must be identical to those in the MCP ##
37 | ####################################################################
38 |
39 | # The grid name is used to separate different grid networks.
40 | # Only networks with the same name connect with each other
41 | grid.name = freeworld
42 |
43 | # Index names of the grid indexes:
44 | # crawlstart : a history of all crawl starts
45 | # crawler : tracking of crawling progress
46 | # query : a history of all queries
47 | # web : the document search index (the "web index")
48 | grid.elasticsearch.indexName.crawlstart = crawlstart
49 | grid.elasticsearch.indexName.crawler = crawler
50 | grid.elasticsearch.indexName.query = query
51 | grid.elasticsearch.indexName.web = web
52 |
53 | # the following type name is an intermediate solution to migrate from elastic 6.x to 8.x
54 | # unfortunately the current index type name is 'web' but in future elastic versions the name '_doc'
55 | # is mandatory. We will use this setting until migration to elastic 8.x is complete and delete
56 | # the configuration afterwards.
57 | grid.elasticsearch.typeName = web
--------------------------------------------------------------------------------
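The comments in conf/config.properties above describe how blacklist lines are evaluated: end-of-line ' #...' comments are stripped, blank and '#' lines are ignored, every remaining line is compiled as a java.util.regex pattern and applied with matches() (a matcher pattern, not a find pattern) against a normalized URL, and all patterns combine as a disjunction. The following is a minimal, self-contained sketch of those rules; the actual implementation is src/main/java/net/yacy/grid/crawler/Blacklist.java further below, and the sample lines are taken from conf/crawler_blacklist_localhost.txt.

```
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

// Sketch of the blacklist-line semantics described in conf/config.properties.
public class BlacklistFormatSketch {

    static List<Pattern> parse(List<String> lines) {
        final List<Pattern> patterns = new ArrayList<>();
        for (String line : lines) {
            final int p = line.indexOf(" #");
            if (p >= 0) line = line.substring(0, p);   // cut away trailing comment
            line = line.trim();                        // remove leading/trailing spaces
            if (line.isEmpty() || line.startsWith("#")) continue; // blank or comment line
            patterns.add(Pattern.compile(line));       // a matcher pattern, not a find pattern
        }
        return patterns;
    }

    static boolean isBlacklisted(String url, List<Pattern> patterns) {
        // disjunction (OR logic): one full match is enough to reject the URL
        for (final Pattern pattern : patterns) {
            if (pattern.matcher(url).matches()) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        final List<Pattern> patterns = parse(Arrays.asList(
                "# Blacklist for local, private or intranet URLs",
                ".*?//localhost.*+ # Localhost host name",
                ".*?//192\\.168\\..*+ # Private IPv4 192.168.0.0/16"));
        System.out.println(isBlacklisted("http://localhost:8300/index.html", patterns)); // true
        System.out.println(isBlacklisted("https://yacy.net/", patterns));                // false
    }
}
```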
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/api/CrawlerDefaultValuesService.java:
--------------------------------------------------------------------------------
1 | /**
2 | * CrawlerDefaultValuesService
3 | * Copyright 04.6.2017 by Michael Peter Christen, @0rb1t3r
4 | *
5 | * This library is free software; you can redistribute it and/or
6 | * modify it under the terms of the GNU Lesser General Public
7 | * License as published by the Free Software Foundation; either
8 | * version 2.1 of the License, or (at your option) any later version.
9 | *
10 | * This library is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | * Lesser General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU Lesser General Public License
16 | * along with this program in the file lgpl21.txt
17 | * If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package net.yacy.grid.crawler.api;
21 |
22 | import javax.servlet.http.HttpServletResponse;
23 |
24 | import org.json.JSONObject;
25 |
26 | import net.yacy.grid.contracts.User;
27 | import net.yacy.grid.http.APIHandler;
28 | import net.yacy.grid.http.ObjectAPIHandler;
29 | import net.yacy.grid.http.Query;
30 | import net.yacy.grid.http.ServiceResponse;
31 |
32 | /**
33 | *
34 | * Test URL:
35 | * http://localhost:8300/yacy/grid/crawler/defaultValues.json
36 | *
37 | * Test command:
38 | * curl http://localhost:8300/yacy/grid/crawler/defaultValues.json
39 | */
40 | public class CrawlerDefaultValuesService extends ObjectAPIHandler implements APIHandler {
41 |
42 | private static final long serialVersionUID = 8578474303031749879L;
43 | public static final String NAME = "defaultValues";
44 |
45 | public static JSONObject defaultValues = new JSONObject(true);
46 | static {
47 | defaultValues.put("crawlingMode", "url");
48 | defaultValues.put("crawlingURL", "");
49 | defaultValues.put("sitemapURL", "");
50 | defaultValues.put("crawlingFile", "");
51 | defaultValues.put("crawlingDepth", 3);
52 | defaultValues.put("crawlingDepthExtension", "");
53 | defaultValues.put("range", "domain");
54 | defaultValues.put("mustmatch", ".*");
55 | defaultValues.put("mustnotmatch", ".*\\.(js|css|jpg|jpeg|png|dmg|mpg|mpeg|zip|gz|exe|pkg)");
56 | defaultValues.put("ipMustmatch", ".*");
57 | defaultValues.put("ipMustnotmatch", "");
58 | defaultValues.put("indexmustmatch", ".*");
59 | defaultValues.put("indexmustnotmatch", "");
60 | defaultValues.put("deleteold", "off");
61 | defaultValues.put("deleteIfOlderNumber", 0);
62 | defaultValues.put("deleteIfOlderUnit", "day");
63 | defaultValues.put("recrawl", "nodoubles");
64 | defaultValues.put("reloadIfOlderNumber", 0);
65 | defaultValues.put("reloadIfOlderUnit", "day");
66 | defaultValues.put("crawlingDomMaxCheck", "off");
67 | defaultValues.put("crawlingDomMaxPages", 1000);
68 | defaultValues.put("crawlingQ", "off");
69 | defaultValues.put("cachePolicy", "if fresh");
70 | defaultValues.put("collection", "user"); // corpus name
71 | defaultValues.put("agentName", "");
72 | defaultValues.put("priority", 0);
73 | defaultValues.put("loaderHeadless", "false");
74 | defaultValues.put("user_id", User.ANONYMOUS_ID);
75 | defaultValues.put("storeAssets", "false");
76 | defaultValues.put("archiveWARC", "false");
77 | defaultValues.put("archiveIndex", "false");
78 | defaultValues.put("archiveGraph", "false");
79 | }
80 |
81 | @Override
82 | public String getAPIPath() {
83 | return "/yacy/grid/crawler/" + NAME + ".json";
84 | }
85 |
86 | public static JSONObject crawlStartDefaultClone() {
87 | final JSONObject json = new JSONObject(true);
88 | defaultValues.keySet().forEach(key -> json.put(key, defaultValues.get(key)));
89 | return json;
90 | }
91 |
92 | @Override
93 | public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) {
94 | return new ServiceResponse(defaultValues);
95 | }
96 |
97 | }
98 |
99 |
--------------------------------------------------------------------------------
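CrawlerDefaultValuesService above publishes the crawl-start defaults at /yacy/grid/crawler/defaultValues.json and offers crawlStartDefaultClone() so callers can start from those defaults and override selected keys. A brief usage sketch follows; the method and key names come from the class above, while the concrete values (URL, depth, collection) are illustrative assumptions only.

```
import org.json.JSONObject;

import net.yacy.grid.crawler.api.CrawlerDefaultValuesService;

// Sketch: derive a crawl-start configuration from the service defaults.
public class CrawlStartConfigSketch {

    public static void main(String[] args) {
        // copy of the default values; the original defaults object stays untouched
        final JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();

        // override only what differs from the defaults (example values)
        crawlstart.put("crawlingURL", "https://example.org/");
        crawlstart.put("crawlingDepth", 2);
        crawlstart.put("collection", "demo");

        System.out.println(crawlstart.toString(2));
    }
}
```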
/README.md:
--------------------------------------------------------------------------------
1 | # YaCy Grid Component: Crawler
2 |
3 | The YaCy Grid is the second-generation implementation of YaCy, a peer-to-peer search engine.
4 | A YaCy Grid installation consists of a set of micro-services which communicate with each other
5 | using the MCP, see https://github.com/yacy/yacy_grid_mcp
6 |
7 | ## Purpose
8 |
9 | The Crawler is a microservice which can be deployed, e.g. using Docker. When the Crawler Component
10 | is started, it searches for an MCP and connects to it. By default the local host is searched for an
11 | MCP, but you can configure a different one.
12 |
13 | ## What it does
14 |
15 | The Crawler then does the following:
16 |
17 | ```
18 | while (a Crawl Contract is in the queue crawler_pending) do
19 | - read the target url from the contract
20 | - check against the search index if the url is registered in the transaction index as 'to-be-parsed'. If not, continue
21 | - load the url content from the assets (it must have been loaded before! - that is another process)
22 | - parse the content and create a YaCy JSON object with that content
23 | - place the YaCy JSON within a contract in the index_pending queue
24 | - extract all links from the YaCy JSON
25 | - check the validity of the links using the crawl contract
26 | - all remaining urls are checked against the transaction index, all existing urls are discarded
27 | - write an index entry for the remaining urls with status 'to-be-loaded'
28 | - and these remaining urls are placed onto the loader_pending queue
29 | - the status of the target url is set to to-be-indexed
30 | od
31 | ```
32 | ## Required Infrastructure (Search Index, Asset Storage and Message Queues)
33 |
34 | This requires a transaction index with the following information:
35 | * `URL` (as defined in https://tools.ietf.org/html/rfc3986)
36 | * `crawlid` (a hash)
37 | * `status` (`to-be-loaded`, `to-be-parsed`, `to-be-indexed`, `indexed`)
38 | As long as a crawl process is running, new URLs (as discovered in the HTML source of a target URL)
39 | must be written to the transaction index before the target URL has a status change (from `to-be-parsed` to `to-be-indexed`).
40 | This makes it possible to discover the status of a crawl job, and the fact that it has terminated,
41 | from the transaction index:
42 | * if all status entries for a single `crawlid` are `indexed`, then the crawl has terminated.
43 | The crawl process needs another database index which contains the crawl description. Its content must be almost the same as
44 | described in http://www.yacy-websuche.de/wiki/index.php/Dev:APICrawler
45 |
46 | Every loader and parser microservice must read this crawl profile information. Because that information is required
47 | many times, we avoid a lookup in the crawler index by adding the crawl profile to each contract of a crawl job in the
48 | crawler_pending and loader_pending queues.
49 |
50 | The crawl is therefore controlled by these queues:
51 | * `loader_pending` queue: entries which the yacy_grid_loader process reads. This process loads the given resources and writes them to the asset storage.
52 | * `crawler_pending` queue: entries which the yacy_grid_crawler process reads. This process loads the content from the asset storage, parses it and creates new loader_pending tasks.
53 |
54 | The required indexes are:
55 | * a crawl profile index
56 | * a transaction index which reflects the crawl status
57 | * a search index
58 |
59 | The microservices will create these indexes on their own using the MCP component.
60 |
61 | ## Installation: Download, Build, Run
62 | At this time, yacy_grid_crawler is not provided in compiled form; you can easily build it yourself. It's not difficult and done in one minute! The source code is hosted at https://github.com/yacy/yacy_grid_crawler; you can download and run it with:
63 |
64 | > git clone --recursive https://github.com/yacy/yacy_grid_crawler.git
65 |
66 | If you just want to make an update, do the following:
67 |
68 | > git pull origin master
69 | > git submodule foreach git pull origin master
70 |
71 | To build and start the crawler, run
72 |
73 | > cd yacy_grid_crawler
74 | > gradle run
75 |
76 | Please also read https://github.com/yacy/yacy_grid_mcp/blob/master/README.md for further details.
77 |
78 | ## Contribute
79 |
80 | This is a community project and your contribution is welcome!
81 |
82 | 1. Check for [open issues](https://github.com/yacy/yacy_grid_crawler/issues)
83 | or open a fresh one to start a discussion around a feature idea or a bug.
84 | 2. Fork [the repository](https://github.com/yacy/yacy_grid_crawler.git)
85 | on GitHub to start making your changes (branch off of the master branch).
86 | 3. Write a test that shows the bug was fixed or the feature works as expected.
87 | 4. Send a pull request and bug us on Gitter until it gets merged and published. :)
88 |
89 | ## What is the software license?
90 | LGPL 2.1
91 |
92 | Have fun!
93 |
94 | @0rb1t3r
95 |
--------------------------------------------------------------------------------
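The README above describes a transaction index whose entries carry a URL, a crawlid and a status that moves from `to-be-loaded` through `to-be-parsed` and `to-be-indexed` to `indexed`, and a crawl counts as terminated once every entry of its crawlid is `indexed`. The sketch below models only that bookkeeping with an in-memory map; the real index lives in Elasticsearch (see grid.elasticsearch.indexName.crawler in conf/config.properties) with its own field names, so everything here is illustrative.

```
import java.util.HashMap;
import java.util.Map;

// Sketch of the transaction-index bookkeeping described in the README.
public class TransactionIndexSketch {

    enum Status { TO_BE_LOADED, TO_BE_PARSED, TO_BE_INDEXED, INDEXED }

    // crawlid -> (url -> status)
    private final Map<String, Map<String, Status>> index = new HashMap<>();

    void register(String crawlid, String url, Status status) {
        this.index.computeIfAbsent(crawlid, k -> new HashMap<>()).put(url, status);
    }

    boolean isTerminated(String crawlid) {
        final Map<String, Status> entries = this.index.get(crawlid);
        if (entries == null || entries.isEmpty()) return false;
        return entries.values().stream().allMatch(s -> s == Status.INDEXED);
    }

    public static void main(String[] args) {
        final TransactionIndexSketch tx = new TransactionIndexSketch();
        // newly discovered URLs are registered before the parent URL changes its status
        tx.register("crawl-1", "https://example.org/", Status.TO_BE_PARSED);
        tx.register("crawl-1", "https://example.org/about", Status.TO_BE_LOADED);
        tx.register("crawl-1", "https://example.org/", Status.TO_BE_INDEXED);
        System.out.println(tx.isTerminated("crawl-1")); // false, one URL still pending

        tx.register("crawl-1", "https://example.org/about", Status.INDEXED);
        tx.register("crawl-1", "https://example.org/", Status.INDEXED);
        System.out.println(tx.isTerminated("crawl-1")); // true
    }
}
```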
/src/main/java/net/yacy/grid/crawler/Blacklist.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Blacklist
3 | * Copyright 17.02.2018 by Michael Peter Christen, @0rb1t3r
4 | *
5 | * This library is free software; you can redistribute it and/or
6 | * modify it under the terms of the GNU Lesser General Public
7 | * License as published by the Free Software Foundation; either
8 | * version 2.1 of the License, or (at your option) any later version.
9 | *
10 | * This library is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | * Lesser General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU Lesser General Public License
16 | * along with this program in the file lgpl21.txt
17 | * If not, see .
18 | */
19 |
20 |
21 | package net.yacy.grid.crawler;
22 |
23 | import java.io.File;
24 | import java.io.IOException;
25 | import java.nio.charset.StandardCharsets;
26 | import java.nio.file.Files;
27 | import java.util.ArrayList;
28 | import java.util.List;
29 | import java.util.concurrent.atomic.AtomicInteger;
30 | import java.util.regex.Matcher;
31 | import java.util.regex.Pattern;
32 | import java.util.regex.PatternSyntaxException;
33 |
34 | import net.yacy.grid.tools.ARC;
35 | import net.yacy.grid.tools.HashARC;
36 | import net.yacy.grid.tools.Logger;
37 | import net.yacy.grid.tools.MultiProtocolURL;
38 |
39 | /**
40 | * A blacklist class to test whether a URL is blacklisted.
41 | * This class has no object synchronization and must not be used in a concurrent environment.
42 | * The lack of concurrency is on purpose: each concurrent thread must initialize its own blacklist.
43 | * This ensures that no concurrency issues can arise from threads sharing the same blacklist.
44 | */
45 | public class Blacklist {
46 |
47 | private final ARC<String, BlacklistInfo> blacklistHitCache;
48 | private final ARC<String, Boolean> blacklistMissCache;
49 | private final List<BlacklistInfo> blacklist;
50 |
51 | public Blacklist() {
52 | this.blacklist = new ArrayList<>();
53 | this.blacklistHitCache = new HashARC<>(100000);
54 | this.blacklistMissCache = new HashARC<>(100000);
55 | }
56 |
57 | public void load(File f) throws IOException {
58 | final AtomicInteger counter = new AtomicInteger(0);
59 | Files.lines(f.toPath(), StandardCharsets.UTF_8).forEach(line -> {
60 | line = line.trim();
61 | int p = line.indexOf(" #");
62 | String info = "";
63 | if (p >= 0) {
64 | info = line.substring(p + 1).trim();
65 | line = line.substring(0, p);
66 | }
67 | line = line.trim();
68 | if (!line.isEmpty() && !line.startsWith("#")) {
69 | if (line.startsWith("host ")) {
70 | String host = line.substring(5).trim();
71 | try {
72 | BlacklistInfo bi = new BlacklistInfo(".*?//" + host + "/.*+", f.getName(), info, host);
73 | this.blacklist.add(bi);
74 | counter.incrementAndGet();
75 | } catch (PatternSyntaxException e) {
76 | Logger.warn(this.getClass(), "regex for host in file " + f.getName() + " cannot be compiled: " + line.substring(5).trim());
77 | }
78 | } else {
79 | try {
80 | BlacklistInfo bi = new BlacklistInfo(line, f.getName(), info, null);
81 | this.blacklist.add(bi);
82 | counter.incrementAndGet();
83 | } catch (PatternSyntaxException e) {
84 | Logger.warn(this.getClass(), "regex for url in file " + f.getName() + " cannot be compiled: " + line);
85 | }
86 | }
87 | }
88 | });
89 | Logger.info(this.getClass(), "loaded " + counter.get() + " blacklist entries from file " + f.getName());
90 | }
91 |
92 | public final static class BlacklistInfo {
93 | public final Matcher matcher;
94 | public final String source;
95 | public final String info;
96 | public final String host;
97 | public BlacklistInfo(final String patternString, final String source, final String info, final String host) throws PatternSyntaxException {
98 | this.matcher = Pattern.compile(patternString).matcher("");
99 | this.source = source;
100 | this.info = info;
101 | this.host = host;
102 | }
103 | }
104 |
105 | public BlacklistInfo isBlacklisted(String url, MultiProtocolURL u) {
106 | BlacklistInfo cachedBI = this.blacklistHitCache.get(url);
107 | if (cachedBI != null) return cachedBI;
108 | Boolean cachedMiss = this.blacklistMissCache.get(url);
109 | if (cachedMiss != null) return null;
110 | for (BlacklistInfo bi: this.blacklist) {
111 | if (u != null && bi.host != null) {
112 | if (u.getHost().equals(bi.host)) {
113 | return bi;
114 | }
115 | } else {
116 | bi.matcher.reset(url);
117 | //Thread.currentThread().setName(bi.matcher.pattern().pattern() + " -> " + url);
118 | if (bi.matcher.matches()) {
119 | this.blacklistHitCache.put(url, bi);
120 | return bi;
121 | }
122 | }
123 | }
124 | this.blacklistMissCache.put(url, Boolean.TRUE);
125 | return null;
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
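A short usage sketch for the Blacklist class above: each worker thread creates and loads its own instance (the class is intentionally not thread-safe) and tests URLs against it. Passing null as the MultiProtocolURL argument, which isBlacklisted explicitly allows, restricts the check to the regex patterns; the file path is one of the shipped blacklists from conf/.

```
import java.io.File;
import java.io.IOException;

import net.yacy.grid.crawler.Blacklist;
import net.yacy.grid.crawler.Blacklist.BlacklistInfo;

// Usage sketch for net.yacy.grid.crawler.Blacklist.
public class BlacklistUsageSketch {

    public static void main(String[] args) throws IOException {
        final Blacklist blacklist = new Blacklist();
        blacklist.load(new File("conf/crawler_blacklist_localhost.txt"));

        final String url = "http://192.168.1.10/admin";
        final BlacklistInfo hit = blacklist.isBlacklisted(url, null);
        if (hit != null) {
            System.out.println(url + " is blacklisted by " + hit.source + " (" + hit.info + ")");
        } else {
            System.out.println(url + " passed the blacklist");
        }
    }
}
```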
/src/main/java/net/yacy/grid/crawler/Crawler.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Crawler
3 | * Copyright 25.04.2017 by Michael Peter Christen, @0rb1t3r
4 | *
5 | * This library is free software; you can redistribute it and/or
6 | * modify it under the terms of the GNU Lesser General Public
7 | * License as published by the Free Software Foundation; either
8 | * version 2.1 of the License, or (at your option) any later version.
9 | *
10 | * This library is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | * Lesser General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU Lesser General Public License
16 | * along with this program in the file lgpl21.txt
17 | * If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package net.yacy.grid.crawler;
21 |
22 | import java.util.ArrayList;
23 | import java.util.Arrays;
24 | import java.util.List;
25 |
26 | import javax.servlet.Servlet;
27 |
28 | import net.yacy.grid.YaCyServices;
29 | import net.yacy.grid.crawler.api.CrawlStartService;
30 | import net.yacy.grid.crawler.api.CrawlerDefaultValuesService;
31 | import net.yacy.grid.mcp.BrokerListener;
32 | import net.yacy.grid.mcp.Configuration;
33 | import net.yacy.grid.mcp.MCP;
34 | import net.yacy.grid.mcp.Service;
35 | import net.yacy.grid.tools.CronBox;
36 | import net.yacy.grid.tools.CronBox.Telemetry;
37 | import net.yacy.grid.tools.Logger;
38 |
39 | /**
40 | * The Crawler main class
41 | *
42 | * performance debugging:
43 | * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt
44 | * http://localhost:8300/yacy/grid/mcp/info/threaddump.txt?count=100
45 | */
46 | public class Crawler {
47 |
48 | private final static YaCyServices CRAWLER_SERVICE = YaCyServices.crawler;
49 | private final static String DATA_PATH = "data";
50 |
51 | // define services
52 | @SuppressWarnings("unchecked")
53 | public final static Class<? extends Servlet>[] CRAWLER_SERVICES = new Class[]{
54 | CrawlerDefaultValuesService.class,
55 | CrawlStartService.class
56 | };
57 |
58 | public static class Application implements CronBox.Application {
59 |
60 | final Configuration config;
61 | final Service service;
62 | final BrokerListener brokerApplication;
63 | final CronBox.Application serviceApplication;
64 |
65 | public Application() {
66 | Logger.info("Starting Crawler Application...");
67 |
68 | // initialize configuration
69 | final List<Class<? extends Servlet>> services = new ArrayList<>();
70 | services.addAll(Arrays.asList(MCP.MCP_SERVLETS));
71 | services.addAll(Arrays.asList(CRAWLER_SERVICES));
72 | this.config = new Configuration(DATA_PATH, true, CRAWLER_SERVICE, services.toArray(new Class[services.size()]));
73 | final int priorityQueues = Integer.parseInt(this.config.properties.get("grid.indexer.priorityQueues"));
74 | CrawlerListener.initPriorityQueue(priorityQueues);
75 |
76 | // initialize REST server with services
77 | this.service = new Service(this.config);
78 |
79 | // connect backend
80 | this.config.connectBackend();
81 |
82 | // initiate broker application: listening to indexing requests at RabbitMQ
83 | this.brokerApplication = new CrawlerListener(this.config, CRAWLER_SERVICE);
84 |
85 | // initiate service application: listening to REST request
86 | this.serviceApplication = this.service.newServer(null);
87 | }
88 |
89 | @Override
90 | public void run() {
91 |
92 | Logger.info("Grid Name: " + this.config.properties.get("grid.name"));
93 |
94 | // starting threads
95 | new Thread(this.brokerApplication).start();
96 | this.serviceApplication.run(); // SIC! the service application is running as the core element of this run() process. If we run it concurrently, this runnable will be "dead".
97 | }
98 |
99 | @Override
100 | public void stop() {
101 | Logger.info("Stopping Crawler Application...");
102 | this.serviceApplication.stop();
103 | this.brokerApplication.stop();
104 | this.service.stop();
105 | this.service.close();
106 | this.config.close();
107 | }
108 |
109 | @Override
110 | public Telemetry getTelemetry() {
111 | return null;
112 | }
113 |
114 | }
115 |
116 | public static void main(final String[] args) {
117 | // run in headless mode
118 | System.setProperty("java.awt.headless", "true"); // no awt used here so we can switch off that stuff
119 |
120 | // Debug Info
121 | boolean assertionenabled = false;
122 | assert (assertionenabled = true) == true; // compare to true to remove warning: "Possible accidental assignement"
123 | if (assertionenabled) Logger.info("Asserts are enabled");
124 |
125 | // first greeting
126 | Logger.info("YaCy Grid Crawler started!");
127 |
128 | // run application with cron
129 | final long cycleDelay = Long.parseLong(System.getProperty("YACYGRID_CRAWLER_CYCLEDELAY", "" + Long.MAX_VALUE)); // by default, run only in one genesis thread
130 | final int cycleRandom = Integer.parseInt(System.getProperty("YACYGRID_CRAWLER_CYCLERANDOM", "" + 1000 * 60 /*1 minute*/));
131 | final CronBox cron = new CronBox(Application.class, cycleDelay, cycleRandom);
132 | cron.cycle();
133 |
134 | // this line is reached if the cron process was shut down
135 | Logger.info("YaCy Grid Crawler terminated");
136 | }
137 |
138 | }
139 |
--------------------------------------------------------------------------------
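Crawler.main above reads the system properties YACYGRID_CRAWLER_CYCLEDELAY and YACYGRID_CRAWLER_CYCLERANDOM to configure the CronBox cycle (by default one long-lived genesis thread). A brief sketch of setting them programmatically before delegating to the real main method; the chosen values and the interpretation as "restart every hour plus up to one minute of jitter" are assumptions for illustration.

```
// Sketch: launch the crawler with a finite cron cycle (property names taken
// from Crawler.main above; the values here are illustrative assumptions).
public class CrawlerCycleSketch {

    public static void main(String[] args) {
        System.setProperty("YACYGRID_CRAWLER_CYCLEDELAY", Long.toString(1000L * 60 * 60)); // assumed: restart cycle every hour
        System.setProperty("YACYGRID_CRAWLER_CYCLERANDOM", Integer.toString(1000 * 60));   // assumed: up to one minute of random jitter
        net.yacy.grid.crawler.Crawler.main(args);
    }
}
```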
/gradlew:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #
4 | # Copyright © 2015-2021 the original authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | #
21 | # Gradle start up script for POSIX generated by Gradle.
22 | #
23 | # Important for running:
24 | #
25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
26 | # noncompliant, but you have some other compliant shell such as ksh or
27 | # bash, then to run this script, type that shell name before the whole
28 | # command line, like:
29 | #
30 | # ksh Gradle
31 | #
32 | # Busybox and similar reduced shells will NOT work, because this script
33 | # requires all of these POSIX shell features:
34 | # * functions;
35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»;
37 | # * compound commands having a testable exit status, especially «case»;
38 | # * various built-in commands including «command», «set», and «ulimit».
39 | #
40 | # Important for patching:
41 | #
42 | # (2) This script targets any POSIX shell, so it avoids extensions provided
43 | # by Bash, Ksh, etc; in particular arrays are avoided.
44 | #
45 | # The "traditional" practice of packing multiple parameters into a
46 | # space-separated string is a well documented source of bugs and security
47 | # problems, so this is (mostly) avoided, by progressively accumulating
48 | # options in "$@", and eventually passing that to Java.
49 | #
50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
52 | # see the in-line comments for details.
53 | #
54 | # There are tweaks for specific operating systems such as AIX, CygWin,
55 | # Darwin, MinGW, and NonStop.
56 | #
57 | # (3) This script is generated from the Groovy template
58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
59 | # within the Gradle project.
60 | #
61 | # You can find Gradle at https://github.com/gradle/gradle/.
62 | #
63 | ##############################################################################
64 |
65 | # Attempt to set APP_HOME
66 |
67 | # Resolve links: $0 may be a link
68 | app_path=$0
69 |
70 | # Need this for daisy-chained symlinks.
71 | while
72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
73 | [ -h "$app_path" ]
74 | do
75 | ls=$( ls -ld "$app_path" )
76 | link=${ls#*' -> '}
77 | case $link in #(
78 | /*) app_path=$link ;; #(
79 | *) app_path=$APP_HOME$link ;;
80 | esac
81 | done
82 |
83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
84 |
85 | APP_NAME="Gradle"
86 | APP_BASE_NAME=${0##*/}
87 |
88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
90 |
91 | # Use the maximum available, or set MAX_FD != -1 to use that value.
92 | MAX_FD=maximum
93 |
94 | warn () {
95 | echo "$*"
96 | } >&2
97 |
98 | die () {
99 | echo
100 | echo "$*"
101 | echo
102 | exit 1
103 | } >&2
104 |
105 | # OS specific support (must be 'true' or 'false').
106 | cygwin=false
107 | msys=false
108 | darwin=false
109 | nonstop=false
110 | case "$( uname )" in #(
111 | CYGWIN* ) cygwin=true ;; #(
112 | Darwin* ) darwin=true ;; #(
113 | MSYS* | MINGW* ) msys=true ;; #(
114 | NONSTOP* ) nonstop=true ;;
115 | esac
116 |
117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
118 |
119 |
120 | # Determine the Java command to use to start the JVM.
121 | if [ -n "$JAVA_HOME" ] ; then
122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
123 | # IBM's JDK on AIX uses strange locations for the executables
124 | JAVACMD=$JAVA_HOME/jre/sh/java
125 | else
126 | JAVACMD=$JAVA_HOME/bin/java
127 | fi
128 | if [ ! -x "$JAVACMD" ] ; then
129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
130 |
131 | Please set the JAVA_HOME variable in your environment to match the
132 | location of your Java installation."
133 | fi
134 | else
135 | JAVACMD=java
136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
137 |
138 | Please set the JAVA_HOME variable in your environment to match the
139 | location of your Java installation."
140 | fi
141 |
142 | # Increase the maximum file descriptors if we can.
143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
144 | case $MAX_FD in #(
145 | max*)
146 | MAX_FD=$( ulimit -H -n ) ||
147 | warn "Could not query maximum file descriptor limit"
148 | esac
149 | case $MAX_FD in #(
150 | '' | soft) :;; #(
151 | *)
152 | ulimit -n "$MAX_FD" ||
153 | warn "Could not set maximum file descriptor limit to $MAX_FD"
154 | esac
155 | fi
156 |
157 | # Collect all arguments for the java command, stacking in reverse order:
158 | # * args from the command line
159 | # * the main class name
160 | # * -classpath
161 | # * -D...appname settings
162 | # * --module-path (only if needed)
163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
164 |
165 | # For Cygwin or MSYS, switch paths to Windows format before running java
166 | if "$cygwin" || "$msys" ; then
167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
169 |
170 | JAVACMD=$( cygpath --unix "$JAVACMD" )
171 |
172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
173 | for arg do
174 | if
175 | case $arg in #(
176 | -*) false ;; # don't mess with options #(
177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
178 | [ -e "$t" ] ;; #(
179 | *) false ;;
180 | esac
181 | then
182 | arg=$( cygpath --path --ignore --mixed "$arg" )
183 | fi
184 | # Roll the args list around exactly as many times as the number of
185 | # args, so each arg winds up back in the position where it started, but
186 | # possibly modified.
187 | #
188 | # NB: a `for` loop captures its iteration list before it begins, so
189 | # changing the positional parameters here affects neither the number of
190 | # iterations, nor the values presented in `arg`.
191 | shift # remove old arg
192 | set -- "$@" "$arg" # push replacement arg
193 | done
194 | fi
195 |
196 | # Collect all arguments for the java command;
197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
198 | # shell script including quotes and variable substitutions, so put them in
199 | # double quotes to make sure that they get re-expanded; and
200 | # * put everything else in single quotes, so that it's not re-expanded.
201 |
202 | set -- \
203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \
204 | -classpath "$CLASSPATH" \
205 | org.gradle.wrapper.GradleWrapperMain \
206 | "$@"
207 |
208 | # Use "xargs" to parse quoted args.
209 | #
210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed.
211 | #
212 | # In Bash we could simply go:
213 | #
214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) &&
215 | # set -- "${ARGS[@]}" "$@"
216 | #
217 | # but POSIX shell has neither arrays nor command substitution, so instead we
218 | # post-process each arg (as a line of input to sed) to backslash-escape any
219 | # character that might be a shell metacharacter, then use eval to reverse
220 | # that process (while maintaining the separation between arguments), and wrap
221 | # the whole thing up as a single "set" statement.
222 | #
223 | # This will of course break if any of these variables contains a newline or
224 | # an unmatched quote.
225 | #
226 |
227 | eval "set -- $(
228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
229 | xargs -n1 |
230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
231 | tr '\n' ' '
232 | )" '"$@"'
233 |
234 | exec "$JAVACMD" "$@"
235 |
--------------------------------------------------------------------------------
/src/main/java/net/yacy/grid/crawler/api/CrawlStartService.java:
--------------------------------------------------------------------------------
1 | /**
2 | * CrawlStartService
3 | * Copyright 12.6.2017 by Michael Peter Christen, @0rb1t3r
4 | *
5 | * This library is free software; you can redistribute it and/or
6 | * modify it under the terms of the GNU Lesser General Public
7 | * License as published by the Free Software Foundation; either
8 | * version 2.1 of the License, or (at your option) any later version.
9 | *
10 | * This library is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | * Lesser General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU Lesser General Public License
16 | * along with this program in the file lgpl21.txt
17 | * If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | package net.yacy.grid.crawler.api;
21 |
22 | import java.io.IOException;
23 | import java.nio.charset.StandardCharsets;
24 | import java.util.Date;
25 | import java.util.Map;
26 | import java.util.regex.Pattern;
27 |
28 | import javax.servlet.http.HttpServletResponse;
29 |
30 | import org.json.JSONArray;
31 | import org.json.JSONObject;
32 |
33 | import ai.susi.mind.SusiAction;
34 | import ai.susi.mind.SusiThought;
35 | import net.yacy.grid.YaCyServices;
36 | import net.yacy.grid.contracts.User;
37 | import net.yacy.grid.crawler.CrawlerListener;
38 | import net.yacy.grid.http.APIHandler;
39 | import net.yacy.grid.http.ObjectAPIHandler;
40 | import net.yacy.grid.http.Query;
41 | import net.yacy.grid.http.ServiceResponse;
42 | import net.yacy.grid.io.index.CrawlerMapping;
43 | import net.yacy.grid.io.index.CrawlstartDocument;
44 | import net.yacy.grid.io.index.CrawlstartMapping;
45 | import net.yacy.grid.io.index.GridIndex;
46 | import net.yacy.grid.io.index.Index.QueryLanguage;
47 | import net.yacy.grid.io.index.WebMapping;
48 | import net.yacy.grid.io.messages.GridQueue;
49 | import net.yacy.grid.io.messages.ShardingMethod;
50 | import net.yacy.grid.mcp.Service;
51 | import net.yacy.grid.tools.Digest;
52 | import net.yacy.grid.tools.Domains;
53 | import net.yacy.grid.tools.JSONList;
54 | import net.yacy.grid.tools.Logger;
55 | import net.yacy.grid.tools.MultiProtocolURL;
56 |
57 | /**
58 | *
59 | * Test URL:
60 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=yacy.net&indexmustnotmatch=.*Mitmachen.*&mustmatch=.*yacy.net.*
61 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=ix.de&crawlingDepth=6&priority=true
62 | * http://localhost:8300/yacy/grid/crawler/crawlStart.json?crawlingURL=tagesschau.de&loaderHeadless=false
63 | *
64 | * then check crawl queue status at http://localhost:15672/
65 | * default account is guest:guest
66 | */
67 | public class CrawlStartService extends ObjectAPIHandler implements APIHandler {
68 |
69 | private static final long serialVersionUID = 8578474303031749879L;
70 | public static final String NAME = "crawlStart";
71 |
72 | @Override
73 | public String getAPIPath() {
74 | return "/yacy/grid/crawler/" + NAME + ".json";
75 | }
76 |
77 | @Override
78 | public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) {
79 | final JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();
80 |
81 | // read call attributes using the default crawlstart key names
82 | for (final String key: crawlstart.keySet()) {
83 | final Object object = crawlstart.get(key);
84 | if (object instanceof String) crawlstart.put(key, call.get(key, crawlstart.getString(key)));
85 | else if (object instanceof Integer) crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
86 | else if (object instanceof Long) crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
87 | else if (object instanceof JSONArray) {
88 | final JSONArray a = crawlstart.getJSONArray(key);
89 | final Object cv = call.get(key);
90 | if (cv != null) crawlstart.put(key, cv);
91 | } else {
92 | System.out.println("unrecognized type: " + object.getClass().toString());
93 | }
94 | }
95 | final String user_id = crawlstart.optString("user_id", User.ANONYMOUS_ID);
96 |
97 | // fix attributes
98 | final int crawlingDepth = crawlstart.optInt("crawlingDepth", 3);
99 | crawlstart.put("crawlingDepth", Math.min(crawlingDepth, 8)); // crawlingDepth shall not exceed 8 - this is used for enhanced balancing to be able to reach crawl leaves
100 | final String mustmatch = crawlstart.optString("mustmatch", CrawlerDefaultValuesService.defaultValues.getString("mustmatch")).trim();
101 | crawlstart.put("mustmatch", mustmatch);
102 | final Map<String, Pattern> collections = WebMapping.collectionParser(crawlstart.optString("collection").trim());
103 |
104 | // set the crawl id
105 | final CrawlerListener.CrawlstartURLSplitter crawlstartURLs = new CrawlerListener.CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
106 | final Date now = new Date();
107 | // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts
108 | final SusiThought allCrawlstarts = new SusiThought();
109 | int count = 0;
110 | for (final MultiProtocolURL url: crawlstartURLs.getURLs()) {
111 | final JSONObject singlecrawl = new JSONObject();
112 | for (final String key: crawlstart.keySet()) singlecrawl.put(key, crawlstart.get(key)); // create a clone of crawlstart
113 | final String crawl_id = CrawlerListener.getCrawlID(url, now, count++);
114 | final String start_url = url.toNormalform(true);
115 | final String start_ssld = Domains.getSmartSLD(url.getHost());
116 | singlecrawl.put("id", crawl_id);
117 | singlecrawl.put("user_id", user_id);
118 | singlecrawl.put("start_url", start_url);
119 | singlecrawl.put("start_ssld", start_ssld);
120 |
121 | //singlecrawl.put("crawlingURLs", new JSONArray().put(url.toNormalform(true)));
122 |
123 | try {
124 | // Create a crawlstart index entry: this will keep track of all crawls that have been started.
125 | // once such an entry is created, it is never changed or deleted again by any YaCy Grid process.
126 | final CrawlstartDocument crawlstartDoc = new CrawlstartDocument()
127 | .setCrawlID(crawl_id)
128 | .setUserID(user_id)
129 | .setMustmatch(mustmatch)
130 | .setCollections(collections.keySet())
131 | .setCrawlstartURL(start_url)
132 | .setCrawlstartSSLD(start_ssld)
133 | .setInitDate(now)
134 | .setData(singlecrawl);
135 | crawlstartDoc.store(Service.instance.config, Service.instance.config.gridIndex);
136 |
137 | // Create a crawler url tracking index entry: this will keep track of single urls and their status
138 | // While it is processed. The entry also serves as a double-check entry to terminate a crawl even if the
139 | // crawler is restarted.
140 |
141 | // delete the start url
142 | final String url_id = Digest.encodeMD5Hex(start_url);
143 | final String crawlerIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER);
144 | final String crawlstartIndexName = Service.instance.config.properties.getOrDefault("grid.elasticsearch.indexName.crawlstart", GridIndex.DEFAULT_INDEXNAME_CRAWLSTART);
145 | long deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"_id\":\"" + url_id + "\"}");
146 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for _id");
147 |
148 | // Because 'old' crawls may block new ones we identify possible blocking entries using the mustmatch pattern.
149 | // We therefore delete all entries with the same mustmatch pattern before a crawl starts.
150 | if (mustmatch.equals(".*")) {
151 | // we cannot delete all wide crawl status urls!
152 | final JSONList old_crawls = Service.instance.config.gridIndex.query(crawlstartIndexName, QueryLanguage.fields, "{ \"" + CrawlstartMapping.start_url_s.name() + "\":\"" + start_url + "\"}", 0, 100);
153 | // from there we pick out the crawl start id and delete using them
154 | for (final Object j: old_crawls.toArray()) {
155 | final String crawlid = ((JSONObject) j).optString(CrawlstartMapping.crawl_id_s.name());
156 | if (crawlid.length() > 0) {
157 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.crawl_id_s.name() + "\":\"" + crawlid + "\"}");
158 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for crawl_id_s");
159 | }
160 | }
161 | // we also delete all entries with same start_url and start_ssld
162 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_url_s.name() + "\":\"" + start_url + "\"}");
163 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_url_s");
164 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, "{ \"" + CrawlerMapping.start_ssld_s.name() + "\":\"" + start_ssld + "\"}");
165 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries for start_ssld_s");
166 | } else {
167 |                 // the mustmatch pattern should match exactly the urls of the old crawl
168 |                 // example query for testing:
169 | // curl -s -H 'Content-Type: application/json' -X GET http://localhost:9200/crawler/_search?q=_id:0a800a8ec1cc76b5eb8412ec494babc9 | python3 -m json.tool
170 | final String deletequery = "{ \"" + CrawlerMapping.mustmatch_s.name() + "\":\"" + mustmatch.replace("\\", "\\\\") + "\"}";
171 | deleted = Service.instance.config.gridIndex.delete(crawlerIndexName, QueryLanguage.fields, deletequery);
172 | Logger.info(this.getClass(), "deleted " + deleted + " old crawl index entries");
173 | }
174 | // we do not create a crawler document entry here because that would conflict with the double check.
175 | // crawler documents must be written after the double check has happened.
176 |
177 | // create a crawl queue entry
178 | final GridQueue queueName = Service.instance.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.BALANCE, CrawlerListener.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost());
179 | final SusiThought json = new SusiThought();
180 | json.setData(new JSONArray().put(singlecrawl));
181 | final JSONObject action = new JSONObject()
182 | .put("type", YaCyServices.crawler.name())
183 | .put("queue", queueName.name())
184 | .put("id", crawl_id)
185 | .put("user_id", user_id)
186 | .put("depth", 0)
187 | .put("sourcegraph", "rootasset");
188 | final SusiAction crawlAction = new SusiAction(action);
189 | final JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), start_url);
190 | crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
191 | json.addAction(crawlAction);
192 | allCrawlstarts.addAction(crawlAction);
193 | final byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
194 | Service.instance.config.gridBroker.send(YaCyServices.crawler, queueName, b);
195 |
196 | } catch (final IOException e) {
197 | Logger.warn(this.getClass(), "error when starting crawl for " + url.toNormalform(true), e);
198 | allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
199 | }
200 | }
201 |
202 | // construct a crawl start message
203 | allCrawlstarts.setData(new JSONArray().put(crawlstart));
204 | allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);
205 |
206 |         // finally return the collected crawl start actions as the service response
207 | return new ServiceResponse(allCrawlstarts);
208 | }
209 |
210 | }
211 |
212 |
--------------------------------------------------------------------------------
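A note on the cleanup logic above (a minimal sketch, not a file of this repository): the crawler status index addresses each url by the MD5 hex of its normalized form, and CrawlStartService removes stale entries with small JSON field queries before a new crawl begins. The following self-contained Java sketch illustrates that mechanic. It assumes that Digest.encodeMD5Hex is a plain MD5 hex digest of the UTF-8 url bytes; the class and method names below are made up for illustration, and only the query shapes and the field name mustmatch_s are taken from the code above.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Sketch of the url addressing and cleanup queries used by CrawlStartService.
// md5Hex is a hypothetical stand-in for Digest.encodeMD5Hex (assumed to be a
// plain MD5 hex digest of the UTF-8 bytes of the normalized url).
public class CrawlStartCleanupSketch {

    static String md5Hex(final String s) throws NoSuchAlgorithmException {
        final byte[] d = MessageDigest.getInstance("MD5").digest(s.getBytes(StandardCharsets.UTF_8));
        final StringBuilder hex = new StringBuilder(d.length * 2);
        for (final byte b : d) hex.append(String.format("%02x", b));
        return hex.toString();
    }

    public static void main(final String[] args) throws NoSuchAlgorithmException {
        final String start_url = "https://example.org/";          // made-up start url
        final String url_id = md5Hex(start_url);                   // document _id in the crawler index

        // delete the old status entry of the start url itself, addressed by _id ...
        final String deleteById = "{ \"_id\":\"" + url_id + "\"}";

        // ... and, for a non-wide crawl, all old entries that share the mustmatch pattern
        // (backslashes must be escaped for the JSON query string, as done in the service)
        final String mustmatch = ".*example\\.org.*";               // made-up pattern
        final String deleteByPattern = "{ \"mustmatch_s\":\"" + mustmatch.replace("\\", "\\\\") + "\"}";

        System.out.println("url_id:            " + url_id);
        System.out.println("delete by id:      " + deleteById);
        System.out.println("delete by pattern: " + deleteByPattern);
    }
}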
/src/main/java/net/yacy/grid/crawler/CrawlerListener.java:
--------------------------------------------------------------------------------
1 | package net.yacy.grid.crawler;
2 |
3 | import java.io.ByteArrayInputStream;
4 | import java.io.File;
5 | import java.io.IOException;
6 | import java.io.Serializable;
7 | import java.net.MalformedURLException;
8 | import java.nio.charset.StandardCharsets;
9 | import java.text.SimpleDateFormat;
10 | import java.util.ArrayList;
11 | import java.util.Collection;
12 | import java.util.Date;
13 | import java.util.HashMap;
14 | import java.util.HashSet;
15 | import java.util.Iterator;
16 | import java.util.List;
17 | import java.util.Locale;
18 | import java.util.Map;
19 | import java.util.Set;
20 | import java.util.concurrent.ConcurrentHashMap;
21 | import java.util.regex.Pattern;
22 |
23 | import org.json.JSONArray;
24 | import org.json.JSONObject;
25 |
26 | import ai.susi.mind.SusiAction;
27 | import ai.susi.mind.SusiThought;
28 | import net.yacy.grid.Services;
29 | import net.yacy.grid.YaCyServices;
30 | import net.yacy.grid.contracts.User;
31 | import net.yacy.grid.io.assets.Asset;
32 | import net.yacy.grid.io.index.CrawlerDocument;
33 | import net.yacy.grid.io.index.CrawlerDocument.Status;
34 | import net.yacy.grid.io.index.GridIndex;
35 | import net.yacy.grid.io.index.WebMapping;
36 | import net.yacy.grid.io.messages.GridQueue;
37 | import net.yacy.grid.io.messages.ShardingMethod;
38 | import net.yacy.grid.mcp.AbstractBrokerListener;
39 | import net.yacy.grid.mcp.BrokerListener;
40 | import net.yacy.grid.mcp.Configuration;
41 | import net.yacy.grid.tools.Classification.ContentDomain;
42 | import net.yacy.grid.tools.CronBox.Telemetry;
43 | import net.yacy.grid.tools.DateParser;
44 | import net.yacy.grid.tools.Digest;
45 | import net.yacy.grid.tools.JSONList;
46 | import net.yacy.grid.tools.Logger;
47 | import net.yacy.grid.tools.MultiProtocolURL;
48 |
49 |
50 | public class CrawlerListener extends AbstractBrokerListener implements BrokerListener {
51 |
52 | private final static String[] FIELDS_IN_GRAPH = new String[]{
53 | WebMapping.inboundlinks_sxt.name(),
54 | WebMapping.outboundlinks_sxt.name(),
55 | //WebMapping.images_sxt.name(),
56 | WebMapping.frames_sxt.name(),
57 | WebMapping.iframes_sxt.name()
58 | };
59 |
60 |     private final static String PATTERN_TIMEF = "yyyyMMddHHmmssSSS"; // calendar year 'yyyy', not week-based 'YYYY'
61 |
62 | public static int[] CRAWLER_PRIORITY_DIMENSIONS = YaCyServices.crawler.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.crawler.getSourceQueues().length - 1, 1};
63 | private static int[] LOADER_PRIORITY_DIMENSIONS = YaCyServices.loader.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.loader.getSourceQueues().length - 1, 1};
64 | private static int[] PARSER_PRIORITY_DIMENSIONS = YaCyServices.parser.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.parser.getSourceQueues().length - 1, 1};
65 | private static int[] INDEXER_PRIORITY_DIMENSIONS = YaCyServices.indexer.getSourceQueues().length == 1 ? new int[] {1, 0} : new int[] {YaCyServices.indexer.getSourceQueues().length - 1, 1};
66 |
67 | static void initPriorityQueue(final int priorityDimension) {
68 | CRAWLER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.crawler, priorityDimension);
69 | LOADER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.loader, priorityDimension);
70 | PARSER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.parser, priorityDimension);
71 | INDEXER_PRIORITY_DIMENSIONS = priorityDimensions(YaCyServices.indexer, priorityDimension);
72 | }
73 |
74 | private static int[] priorityDimensions(final YaCyServices service, final int d) {
75 | return service.getSourceQueues().length <= d ? new int[] {service.getSourceQueues().length, 0} : new int[] {service.getSourceQueues().length - d, d};
76 | }
77 |
78 | private final String[] blacklist_crawler_names_list, blacklist_indexer_names_list;
79 |     private final Map<String, Blacklist> blacklists_crawler, blacklists_indexer;
80 |
81 |     //private final static Map<String, DoubleCache> doubles = Service.hazelcast.getMap("doubles");
82 |     private final Map<String, DoubleCache> doubles = new ConcurrentHashMap<>();
83 | private static long doublesLastCleanup = System.currentTimeMillis();
84 | private final static long doublesCleanupTimeout = 1000L * 60L * 60L * 24L * 7L; // cleanup after 7 days
85 | private final static long doublesCleanupPeriod = 1000L * 60L * 10L; // do cleanup each 10 minutes
86 | private static class DoubleCache implements Serializable {
87 | private static final long serialVersionUID = 614262945713636851L;
88 |         public Set<String> doubleHashes;
89 | public long time;
90 | public DoubleCache() {
91 | this.time = System.currentTimeMillis();
92 | this.doubleHashes = ConcurrentHashMap.newKeySet();
93 | }
94 | }
95 |
96 | private void doDoubleCleanup() {
97 | final long now = System.currentTimeMillis();
98 | if (now - doublesLastCleanup < doublesCleanupPeriod) return;
99 | doublesLastCleanup = now;
100 |         final Iterator<Map.Entry<String, DoubleCache>> i = this.doubles.entrySet().iterator();
101 | while (i.hasNext()) {
102 |             final Map.Entry<String, DoubleCache> cache = i.next();
103 | if ((now - cache.getValue().time) > doublesCleanupTimeout) {
104 | cache.getValue().doubleHashes.clear();
105 | i.remove();
106 | }
107 | }
108 | }
109 |
110 | public static class CrawlstartURLSplitter {
111 |
112 |         private final List<MultiProtocolURL> crawlingURLArray;
113 |         private final List<String> badURLStrings;
114 |
115 | public CrawlstartURLSplitter(String crawlingURLsString) {
116 | Logger.info(this.getClass(), "splitting url list: " + crawlingURLsString);
117 | crawlingURLsString = crawlingURLsString.replaceAll("\\|http", "\nhttp").replaceAll("%7Chttp", "\nhttp").replaceAll("%0D%0A", "\n").replaceAll("%0A", "\n").replaceAll("%0D", "\n").replaceAll(" ", "\n");
118 | final String[] crawlingURLs = crawlingURLsString.split("\n");
119 | this.crawlingURLArray = new ArrayList<>();
120 | this.badURLStrings = new ArrayList<>();
121 | for (final String u: crawlingURLs) {
122 | try {
123 | final MultiProtocolURL url = new MultiProtocolURL(u);
124 |                     Logger.info(this.getClass(), "split url: " + url.toNormalform(true));
125 | this.crawlingURLArray.add(url);
126 | } catch (final MalformedURLException e) {
127 | this.badURLStrings.add(u);
128 |                     Logger.warn(this.getClass(), "error when starting crawl with splitter url " + u + "; split from " + crawlingURLsString, e);
129 | }
130 | }
131 | }
132 |
133 |         public List<MultiProtocolURL> getURLs() {
134 | return this.crawlingURLArray;
135 | }
136 |
137 |         public List<String> getBadURLs() {
138 | return this.badURLStrings;
139 | }
140 | }
141 |
142 | public static String getCrawlID(final MultiProtocolURL url, final Date date, final int count) {
143 | String id = url.getHost();
144 | if (id.length() > 80) id = id.substring(0, 80) + "-" + id.hashCode();
145 | id = id + "-" + DateParser.secondDateFormat.format(date).replace(':', '-').replace(' ', '-') + "-" + count;
146 | return id;
147 | }
148 |
149 | public CrawlerListener(final Configuration config, final YaCyServices service) {
150 | super(config, service, Runtime.getRuntime().availableProcessors());
151 |
152 | this.blacklist_crawler_names_list = config.properties.get("grid.crawler.blacklist").split(",");
153 | this.blacklist_indexer_names_list = config.properties.get("grid.indexer.blacklist").split(",");
154 | this.blacklists_crawler = new ConcurrentHashMap<>();
155 | this.blacklists_indexer = new ConcurrentHashMap<>();
156 | }
157 |
158 | private final Blacklist getBlacklistCrawler(final String processName, final int processNumber) {
159 | final String key = processName + "_" + processNumber;
160 | Blacklist blacklist = this.blacklists_crawler.get(key);
161 | if (blacklist == null) {
162 | this.blacklists_crawler.put(key, blacklist = loadBlacklist(this.blacklist_crawler_names_list));
163 | }
164 | return blacklist;
165 | }
166 |
167 | private final Blacklist getBlacklistIndexer(final String processName, final int processNumber) {
168 | final String key = processName + "_" + processNumber;
169 | Blacklist blacklist = this.blacklists_indexer.get(key);
170 | if (blacklist == null) {
171 | this.blacklists_indexer.put(key, blacklist = loadBlacklist(this.blacklist_indexer_names_list));
172 | }
173 | return blacklist;
174 | }
175 |
176 | private final Blacklist loadBlacklist(final String[] names) {
177 | final Blacklist blacklist = new Blacklist();
178 | for (final String name: names) {
179 | File f = new File(super.config.gridServicePath, "conf/" + name.trim());
180 | if (!f.exists()) f = new File("conf/" + name.trim());
181 | if (!f.exists()) continue;
182 | try {
183 | blacklist.load(f);
184 | } catch (final IOException e) {
185 | Logger.warn(this.getClass(), e);
186 | }
187 | }
188 | return blacklist;
189 | }
190 |
191 | @Override
192 | public ActionResult processAction(final SusiAction crawlaction, final JSONArray data, final String processName, final int processNumber) {
193 | doDoubleCleanup();
194 | final String crawl_id = crawlaction.getStringAttr("id");
195 | String user_id = crawlaction.getStringAttr("user_id");
196 | if (user_id == null || user_id.length() == 0) user_id = User.ANONYMOUS_ID;
197 | JSONArray user_ids = crawlaction.getArrayAttr("user_ids");
198 | if (user_ids == null) user_ids = new JSONArray();
199 | if (user_id != null && user_id.length() > 0 && !user_ids.toList().contains(user_id)) user_ids.put(user_id);
200 |
201 | if (crawl_id == null || crawl_id.length() == 0) {
202 |             Logger.info(this.getClass(), "Crawler.processAction Fail: Action does not have an id: " + crawlaction.toString());
203 | return ActionResult.FAIL_IRREVERSIBLE;
204 | }
205 | final JSONObject crawl = SusiThought.selectData(data, "id", crawl_id);
206 | if (crawl == null) {
207 | Logger.info(this.getClass(), "Crawler.processAction Fail: ID of Action not found in data: " + crawlaction.toString());
208 | return ActionResult.FAIL_IRREVERSIBLE;
209 | }
210 |
211 | final boolean archiveWARC = crawl.optBoolean("archiveWARC");
212 | final boolean archiveIndex = crawl.optBoolean("archiveIndex");
213 | final boolean archiveGraph = crawl.optBoolean("archiveGraph");
214 |
215 | final int depth = crawlaction.getIntAttr("depth");
216 | final int crawlingDepth = crawl.getInt("crawlingDepth");
217 | final int priority = crawl.has("priority") ? crawl.getInt("priority") : 0;
218 | // check depth (this check should be deprecated because we limit by omitting the crawl message at crawl tree leaves)
219 | if (depth > crawlingDepth) {
220 | // this is a leaf in the crawl tree (it does not mean that the crawl is finished)
221 | Logger.info(this.getClass(), "Crawler.processAction Leaf: reached a crawl leaf for crawl " + crawl_id + ", depth = " + crawlingDepth);
222 | return ActionResult.SUCCESS;
223 | }
224 | final boolean isCrawlLeaf = depth == crawlingDepth;
225 |
226 | // load graph
227 | final String sourcegraph = crawlaction.getStringAttr("sourcegraph");
228 | if (sourcegraph == null || sourcegraph.length() == 0) {
229 | Logger.info(this.getClass(), "Crawler.processAction Fail: sourcegraph of Action is empty: " + crawlaction.toString());
230 | return ActionResult.FAIL_IRREVERSIBLE;
231 | }
232 | try {
233 | JSONList jsonlist = null;
234 | if (crawlaction.hasAsset(sourcegraph)) {
235 | jsonlist = crawlaction.getJSONListAsset(sourcegraph);
236 | }
237 | if (jsonlist == null) try {
238 | final Asset graphasset = super.config.gridStorage.load(sourcegraph); // this must be a list of json, containing document links
239 | final byte[] graphassetbytes = graphasset.getPayload();
240 | jsonlist = new JSONList(new ByteArrayInputStream(graphassetbytes));
241 | } catch (final IOException e) {
242 | Logger.warn(this.getClass(), "Crawler.processAction could not read asset from storage: " + sourcegraph, e);
243 | return ActionResult.FAIL_IRREVERSIBLE;
244 | }
245 |
246 | // declare filter from the crawl profile
247 | final String mustmatchs = crawl.optString("mustmatch");
248 | final Pattern mustmatch = Pattern.compile(mustmatchs);
249 | final String mustnotmatchs = crawl.optString("mustnotmatch");
250 | final Pattern mustnotmatch = Pattern.compile(mustnotmatchs);
251 | // filter for indexing steering
252 | final String indexmustmatchs = crawl.optString("indexmustmatch");
253 | final Pattern indexmustmatch = Pattern.compile(indexmustmatchs);
254 | final String indexmustnotmatchs = crawl.optString("indexmustnotmatch");
255 | final Pattern indexmustnotmatch = Pattern.compile(indexmustnotmatchs);
256 | // attributes for new crawl entries
257 | final String collectionss = crawl.optString("collection");
258 |             final Map<String, Pattern> collections = WebMapping.collectionParser(collectionss);
259 | final String start_url = crawl.optString("start_url");
260 | final String start_ssld = crawl.optString("start_ssld");
261 |
262 | final Date now = new Date();
263 | final long timestamp = now.getTime();
264 |             // For each of the parsed documents, there is a target graph.
265 | // The graph contains all url elements which may appear in a document.
266 | // In the following loop we collect all urls which may be of interest for the next depth of the crawl.
267 |             final Map<String, String> nextMap = new HashMap<>(); // a map from urlid to url
268 | final Blacklist blacklist_crawler = getBlacklistCrawler(processName, processNumber);
269 |             final List<CrawlerDocument> crawlerDocuments = new ArrayList<>();
270 | graphloop: for (int line = 0; line < jsonlist.length(); line++) {
271 | final JSONObject json = jsonlist.get(line);
272 | if (json.has("index")) continue graphloop; // this is an elasticsearch index directive, we just skip that
273 |
274 | final String sourceurl = json.has(WebMapping.url_s.getMapping().name()) ? json.getString(WebMapping.url_s.getMapping().name()) : "";
275 |                 final Set<MultiProtocolURL> graph = new HashSet<>();
276 | final String graphurl = json.has(WebMapping.canonical_s.name()) ? json.getString(WebMapping.canonical_s.name()) : null;
277 | if (graphurl != null) try {
278 | graph.add(new MultiProtocolURL(graphurl));
279 | } catch (final MalformedURLException e) {
280 | Logger.warn(this.getClass(), "Crawler.processAction error when starting crawl with canonical url " + graphurl, e);
281 | }
282 | for (final String field: FIELDS_IN_GRAPH) {
283 | if (json.has(field)) {
284 | final JSONArray a = json.getJSONArray(field);
285 | urlloop: for (int i = 0; i < a.length(); i++) {
286 | final String u = a.getString(i);
287 | try {
288 | graph.add(new MultiProtocolURL(u));
289 | } catch (final MalformedURLException e) {
290 | Logger.warn(this.getClass(), "Crawler.processAction we discovered a bad follow-up url: " + u, e);
291 | continue urlloop;
292 | }
293 | }
294 | }
295 | }
296 |
297 | // sort out doubles and apply filters
298 | DoubleCache doublecache = null;
299 | if (!this.doubles.containsKey(crawl_id)) this.doubles.put(crawl_id, new DoubleCache());
300 | doublecache = this.doubles.get(crawl_id);
301 | Logger.info(this.getClass(), "Crawler.processAction processing sub-graph with " + graph.size() + " urls for url " + sourceurl);
302 | urlcheck: for (final MultiProtocolURL url: graph) {
303 | // prepare status document
304 | final ContentDomain cd = url.getContentDomainFromExt();
305 |
306 | if (cd == ContentDomain.TEXT || cd == ContentDomain.ALL) {
307 | // check if the url shall be loaded using the constraints
308 | final String u = url.toNormalform(true);
309 | final String urlid = Digest.encodeMD5Hex(u);
310 |
311 | // double check with the fast double cache
312 | if (doublecache.doubleHashes.contains(urlid)) {
313 | continue urlcheck;
314 | }
315 | doublecache.doubleHashes.add(urlid);
316 |
317 | // create new crawl status document
318 | final CrawlerDocument crawlStatus = new CrawlerDocument()
319 | .setCrawlID(crawl_id)
320 | .setUserlID(user_id)
321 | .setMustmatch(mustmatchs)
322 | .setCollections(collections.keySet())
323 | .setCrawlstartURL(start_url)
324 | .setCrawlstartSSLD(start_ssld)
325 | .setInitDate(now)
326 | .setStatusDate(now)
327 | .setURL(u);
328 |
329 | // check matcher rules
330 | if (!mustmatch.matcher(u).matches() || mustnotmatch.matcher(u).matches()) {
331 | crawlStatus
332 | .setStatus(Status.rejected)
333 | .setComment(!mustmatch.matcher(u).matches() ? "url does not match must-match filter " + mustmatchs : "url matches mustnotmatch filter " + mustnotmatchs);
334 | crawlerDocuments.add(crawlStatus);
335 | continue urlcheck;
336 | }
337 |
338 | // check blacklist (this is costly because the blacklist is huge)
339 | final Blacklist.BlacklistInfo blacklistInfo = blacklist_crawler.isBlacklisted(u, url);
340 | if (blacklistInfo != null) {
341 | Logger.info(this.getClass(), "Crawler.processAction crawler blacklist pattern '" + blacklistInfo.matcher.pattern().toString() + "' removed url '" + u + "' from crawl list " + blacklistInfo.source + ": " + blacklistInfo.info);
342 | crawlStatus
343 | .setStatus(Status.rejected)
344 | .setComment("url matches blacklist");
345 | crawlerDocuments.add(crawlStatus);
346 | continue urlcheck;
347 | }
348 |
349 | // double check with the elastic index (we do this late here because it is the most costly operation)
350 | //if (config.gridIndex.exist(GridIndex.CRAWLER_INDEX_NAME, GridIndex.EVENT_TYPE_NAME, urlid)) {
351 | // continue urlcheck;
352 | //}
353 |
354 | // add url to next stack
355 | nextMap.put(urlid, u);
356 | }
357 | };
358 | }
359 |
360 | if (!nextMap.isEmpty()) {
361 |
362 | // make a double-check
363 | final String crawlerIndexName = super.config.properties.getOrDefault("grid.elasticsearch.indexName.crawler", GridIndex.DEFAULT_INDEXNAME_CRAWLER);
364 |                 final Set<String> exist = super.config.gridIndex.existBulk(crawlerIndexName, nextMap.keySet());
365 | for (final String u: exist) nextMap.remove(u);
366 |                 final Collection<String> nextList = nextMap.values(); // a collection of urls
367 |
368 | // divide the nextList into two sub-lists, one which will reach the indexer and another one which will not cause indexing
369 | @SuppressWarnings("unchecked")
370 | final
371 |                 List<String>[] indexNoIndex = new List[2];
372 | indexNoIndex[0] = new ArrayList<>(); // for: index
373 | indexNoIndex[1] = new ArrayList<>(); // for: no-Index
374 | final Blacklist blacklist_indexer = getBlacklistIndexer(processName, processNumber);
375 | nextList.forEach(url -> {
376 |                     final boolean indexConstraintFromCrawlProfile = indexmustmatch.matcher(url).matches() && !indexmustnotmatch.matcher(url).matches();
377 | final Blacklist.BlacklistInfo blacklistInfo = blacklist_indexer.isBlacklisted(url, null);
378 | final boolean indexConstraintFromBlacklist = blacklistInfo == null;
379 |                     if (indexConstraintFromCrawlProfile && indexConstraintFromBlacklist) {
380 | indexNoIndex[0].add(url);
381 | } else {
382 | indexNoIndex[1].add(url);
383 | }
384 | });
385 |
386 | for (int ini = 0; ini < 2; ini++) {
387 |
388 | // create crawler index entries
389 | for (final String u: indexNoIndex[ini]) {
390 | final CrawlerDocument crawlStatus = new CrawlerDocument()
391 | .setCrawlID(crawl_id)
392 | .setUserlID(user_id)
393 | .setMustmatch(mustmatchs)
394 | .setCollections(collections.keySet())
395 | .setCrawlstartURL(start_url)
396 | .setCrawlstartSSLD(start_ssld)
397 | .setInitDate(now)
398 | .setStatusDate(now)
399 | .setStatus(Status.accepted)
400 | .setURL(u)
401 | .setComment(ini == 0 ? "to be indexed" : "noindex, just for crawling");
402 | crawlerDocuments.add(crawlStatus);
403 | }
404 |
405 | // create partitions
406 |                     final List<JSONArray> partitions = createPartition(indexNoIndex[ini], 8);
407 |
408 | // create follow-up crawl to next depth
409 | for (int pc = 0; pc < partitions.size(); pc++) {
410 | final JSONObject loaderAction = newLoaderAction(
411 | priority, crawl_id, user_id, user_ids, partitions.get(pc), depth, isCrawlLeaf,
412 | 0, timestamp + ini, pc, depth < crawlingDepth, ini == 0,
413 | archiveWARC, archiveIndex, archiveGraph); // action includes whole hierarchy of follow-up actions
414 | final SusiThought nextjson = new SusiThought()
415 | .setData(data)
416 | .addAction(new SusiAction(loaderAction));
417 |
418 | // put a loader message on the queue
419 | final String message = nextjson.toString(2);
420 | final byte[] b = message.getBytes(StandardCharsets.UTF_8);
421 | try {
422 | final Services serviceName = YaCyServices.valueOf(loaderAction.getString("type"));
423 | final GridQueue queueName = new GridQueue(loaderAction.getString("queue"));
424 | super.config.gridBroker.send(serviceName, queueName, b);
425 | } catch (final IOException e) {
426 | Logger.warn(this.getClass(), "error when starting crawl with message " + message, e);
427 | }
428 | };
429 | }
430 | }
431 | // bulk-store the crawler documents
432 |             final Map<String, CrawlerDocument> crawlerDocumentsMap = new HashMap<>();
433 | crawlerDocuments.forEach(crawlerDocument -> {
434 | final String url = crawlerDocument.getURL();
435 | if (url != null && url.length() > 0) {
436 | final String id = Digest.encodeMD5Hex(url);
437 | crawlerDocumentsMap.put(id, crawlerDocument);
438 | } else {
439 | assert false : "url not set / storeBulk";
440 | }
441 | });
442 | CrawlerDocument.storeBulk(super.config, super.config.gridIndex, crawlerDocumentsMap);
443 | Logger.info(this.getClass(), "Crawler.processAction processed graph with " + jsonlist.length()/2 + " subgraphs from " + sourcegraph);
444 | return ActionResult.SUCCESS;
445 | } catch (final Throwable e) {
446 | Logger.warn(this.getClass(), "Crawler.processAction Fail: loading of sourcegraph failed: " + e.getMessage() /*+ "\n" + crawlaction.toString()*/, e);
447 | return ActionResult.FAIL_IRREVERSIBLE;
448 | }
449 | }
450 |
451 |     private static List<JSONArray> createPartition(final Collection<String> urls, final int partitionSize) {
452 | final List partitions = new ArrayList<>();
453 | urls.forEach(url -> {
454 | int c = partitions.size();
455 | if (c == 0 || partitions.get(c - 1).length() >= partitionSize) {
456 | partitions.add(new JSONArray());
457 | c++;
458 | }
459 | partitions.get(c - 1).put(url);
460 | });
461 | return partitions;
462 | }
463 |
464 | /**
465 | * Create a new loader action. This action contains all follow-up actions after
466 | * loading to create a steering of parser, indexing and follow-up crawler actions.
467 |      * @param priority the priority of the crawl
468 | * @param id the crawl id
469 | * @param user_id the id of the user (9 digit number)
470 |      * @param user_ids all users which have that domain assigned as a crawl
471 | * @param urls the urls which are part of the same actions
472 | * @param depth the depth of the crawl step (0 is start depth)
473 |      * @param retry the number of load retries (0 means no retry, i.e. this is the first attempt)
474 | * @param timestamp the current time when the crawler created the action
475 | * @param partition unique number of the url set partition. This is used to create asset names.
476 | * @param doCrawling flag: if true, create a follow-up crawling action. set this to false to terminate crawling afterwards
477 | * @param doIndexing flag: if true, do an indexing after loading. set this to false if the purpose is only a follow-up crawl after parsing
478 | * @return the action json
479 | * @throws IOException
480 | */
481 | private JSONObject newLoaderAction(
482 | final int priority,
483 | final String id,
484 | final String user_id,
485 | final JSONArray user_ids,
486 | final JSONArray urls,
487 | final int depth,
488 | final boolean isCrawlLeaf,
489 | final int retry,
490 | final long timestamp,
491 | final int partition,
492 | final boolean doCrawling,
493 | final boolean doIndexing,
494 | final boolean archiveWARC,
495 | final boolean archiveIndex,
496 | final boolean archiveGraph) throws IOException {
497 | // create file names for the assets: this uses depth and partition information
498 |         final SimpleDateFormat FORMAT_TIMEF = new SimpleDateFormat(PATTERN_TIMEF, Locale.US); // we must create this instance here because SimpleDateFormat is not thread-safe
499 | final String basepath = "/data/aaaaa/accounting/" + user_id + "/";
500 | final String docname = "d" + intf(depth, 2) + "-t" + FORMAT_TIMEF.format(new Date(timestamp)) + "-p" + intf(partition, 4);
501 | final String warcasset = basepath + "warc/" + id + "/" + docname + ".warc.gz";
502 | final String indexasset = basepath + "index/" + id + "/" + docname + ".index.jsonlist";
503 | final String graphasset = basepath + "graph/" + id + "/" + docname + ".graph.jsonlist";
504 | final String hashKey = new MultiProtocolURL(urls.getString(0)).getHost();
505 |
506 | // create actions to be done in reverse order:
507 | // at the end of the processing we simultaneously place actions on the indexing and crawling queue
508 | final JSONArray postParserActions = new JSONArray();
509 | assert doIndexing || doCrawling; // one or both must be true; doing none of that does not make sense
510 | // if all of the urls shall be indexed (see indexing patterns) then do indexing actions
511 | if (doIndexing) {
512 | final GridQueue indexerQueueName = super.config.gridBroker.queueName(YaCyServices.indexer, YaCyServices.indexer.getSourceQueues(), ShardingMethod.LEAST_FILLED, INDEXER_PRIORITY_DIMENSIONS, priority, hashKey);
513 | postParserActions.put(new JSONObject(true)
514 | .put("type", YaCyServices.indexer.name())
515 | .put("queue", indexerQueueName.name())
516 | .put("id", id)
517 | .put("user_id", user_id)
518 | .put("user_ids", user_ids)
519 | .put("sourceasset", indexasset)
520 | .put("archiveindex", archiveIndex)
521 | );
522 | }
523 |         // if the urls shall be crawled at depth + 1, add a crawling action; this is skipped only when the crawl has reached its depth limit
524 | if (doCrawling) {
525 | final GridQueue crawlerQueueName = super.config.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getSourceQueues(), ShardingMethod.LEAST_FILLED, CRAWLER_PRIORITY_DIMENSIONS, priority, hashKey);
526 | postParserActions.put(new JSONObject(true)
527 | .put("type", YaCyServices.crawler.name())
528 | .put("queue", crawlerQueueName.name())
529 | .put("id", id)
530 | .put("user_id", user_id)
531 | .put("user_ids", user_ids)
532 | .put("depth", depth + 1)
533 | .put("sourcegraph", graphasset)
534 | .put("archivegraph", archiveGraph)
535 | );
536 | }
537 |
538 | // before that and after loading we have a parsing action
539 | final GridQueue parserQueueName = super.config.gridBroker.queueName(YaCyServices.parser, YaCyServices.parser.getSourceQueues(), ShardingMethod.LEAST_FILLED, PARSER_PRIORITY_DIMENSIONS, priority, hashKey);
540 | final JSONArray parserActions = new JSONArray().put(new JSONObject(true)
541 | .put("type", YaCyServices.parser.name())
542 | .put("queue", parserQueueName.name())
543 | .put("id", id)
544 | .put("user_id", user_id)
545 | .put("user_ids", user_ids)
546 | .put("sourceasset", warcasset)
547 | .put("targetasset", indexasset)
548 | .put("targetgraph", graphasset)
549 | .put("archivewarc", archiveWARC)
550 | .put("archiveindex", archiveIndex)
551 | .put("archivegraph", archiveGraph)
552 | .put("actions", postParserActions)); // actions after parsing
553 |
554 | // at the beginning of the process, we do a loading.
555 | final GridQueue loaderQueueName = super.config.gridBroker.queueName(YaCyServices.loader, YaCyServices.loader.getSourceQueues(), isCrawlLeaf ? ShardingMethod.LEAST_FILLED : ShardingMethod.BALANCE, LOADER_PRIORITY_DIMENSIONS, priority, hashKey);
556 | final JSONObject loaderAction = new JSONObject(true)
557 | .put("type", YaCyServices.loader.name())
558 | .put("queue", loaderQueueName.name())
559 | .put("id", id)
560 | .put("user_id", user_id)
561 | .put("user_ids", user_ids)
562 | .put("urls", urls)
563 | .put("targetasset", warcasset)
564 | .put("archivewarc", archiveWARC)
565 | .put("actions", parserActions); // actions after loading
566 | return loaderAction;
567 | }
568 |
569 | private final static String intf(final int i, final int len) {
570 | String s = Integer.toString(i);
571 | while (s.length() < len) s = '0' + s;
572 | return s;
573 | }
574 |
575 | @Override
576 | public Telemetry getTelemetry() {
577 | return null;
578 | }
579 | }
580 |
--------------------------------------------------------------------------------
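The partitioning and asset naming in CrawlerListener can be summarized with a small JDK-only sketch (not part of the repository). It mirrors createPartition and intf, replacing JSONArray partitions with List<String>; the urls and the depth/timestamp values are made up for illustration.

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Locale;

// Sketch of the partitioning and asset naming done by CrawlerListener:
// accepted urls are chunked into partitions of 8, and every partition gets
// asset names of the form d<depth>-t<timestamp>-p<partition>.
public class PartitionNamingSketch {

    // mirrors CrawlerListener.createPartition, using List<String> instead of JSONArray
    static List<List<String>> createPartition(final Collection<String> urls, final int partitionSize) {
        final List<List<String>> partitions = new ArrayList<>();
        urls.forEach(url -> {
            if (partitions.isEmpty() || partitions.get(partitions.size() - 1).size() >= partitionSize) {
                partitions.add(new ArrayList<>());
            }
            partitions.get(partitions.size() - 1).add(url);
        });
        return partitions;
    }

    // mirrors CrawlerListener.intf: left-pad a number with zeros to a fixed width
    static String intf(final int i, final int len) {
        final StringBuilder s = new StringBuilder(Integer.toString(i));
        while (s.length() < len) s.insert(0, '0');
        return s.toString();
    }

    public static void main(final String[] args) {
        final List<String> urls = new ArrayList<>();
        for (int i = 0; i < 20; i++) urls.add("https://example.org/page" + i); // made-up urls

        final List<List<String>> partitions = createPartition(urls, 8);        // 8, 8 and 4 urls
        final SimpleDateFormat timef = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US);
        final int depth = 1;                                                    // made-up depth
        final long timestamp = System.currentTimeMillis();

        for (int pc = 0; pc < partitions.size(); pc++) {
            final String docname = "d" + intf(depth, 2) + "-t" + timef.format(new Date(timestamp)) + "-p" + intf(pc, 4);
            System.out.println(docname + " -> " + partitions.get(pc).size() + " urls, first: " + partitions.get(pc).get(0));
        }
    }
}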