├── .github └── FUNDING.yml ├── bin ├── restart.sh ├── start.sh ├── stop.sh └── start_loader_docker.sh ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── .dockerignore ├── .gitignore ├── .gitmodules ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.buildship.core.prefs ├── src └── main │ ├── resources │ └── log4j.properties │ └── java │ └── net │ └── yacy │ └── grid │ └── loader │ ├── retrieval │ ├── HttpClient.java │ ├── JavaHttpClient.java │ ├── LoaderClientConnection.java │ ├── HtmlUnitLoader.java │ ├── ContentLoader.java │ └── FTPClient.java │ ├── api │ ├── LoaderService.java │ └── ProcessService.java │ ├── JwatWarcWriter.java │ ├── Loader.java │ └── LoaderListener.java ├── .project ├── Dockerfile ├── .classpath ├── conf └── config.properties ├── README.md ├── gradlew.bat └── gradlew /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: orbiter 2 | patreon: 0rb1t3r 3 | -------------------------------------------------------------------------------- /bin/restart.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | ./stop.sh 4 | sleep 1 5 | ./start.sh 6 | 7 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yacy/yacy_grid_loader/master/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | data 4 | build 5 | bin 6 | docker 7 | Dockerfile 8 | LICENSE.md 9 | README.md 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | target/ 3 | data/ 4 | /class/ 5 | /.gradle/ 6 | /build/ 7 | .DS_Store 8 | .settings 9 | .idea/ 10 | 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/yacy_grid_mcp"] 2 | path = submodules/yacy_grid_mcp 3 | url = https://github.com/yacy/yacy_grid_mcp.git 4 | -------------------------------------------------------------------------------- /bin/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | cd .. 4 | nohup java -jar build/libs/yacy_grid_loader-0.0.1-SNAPSHOT-all.jar < /dev/null & 5 | sleep 1 6 | echo "YaCy Grid Loader started!" 7 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.source=1.8 5 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 10 | 11 | log4j.logger.org.eclipse.jetty = INFO 12 | log4j.logger.org.apache.http = INFO 13 | -------------------------------------------------------------------------------- /bin/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | cd "`dirname $0`" 3 | cd ../data 4 | KILLFILE="loader-8200.kill" 5 | PIDFILE="loader-8200.pid" 6 | 7 | # first method to terminate the process 8 | if [ -f "$KILLFILE" ]; 9 | then 10 | rm $KILLFILE 11 | echo "termination requested, waiting.." 12 | # this can take 10 seconds.. 13 | sleep 10 14 | fi 15 | 16 | # second method to terminate the process 17 | if [ -f "$PIDFILE" ]; 18 | then 19 | fuser -k $PIDFILE 20 | fi 21 | 22 | # check if file does not exist any more which would be a sign that this has terminated 23 | if [ ! -f "$PIDFILE" ]; 24 | then 25 | echo "process terminated" 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /.settings/org.eclipse.buildship.core.prefs: -------------------------------------------------------------------------------- 1 | arguments= 2 | auto.sync=false 3 | build.commands=org.eclipse.jdt.core.javabuilder 4 | build.scans.enabled=false 5 | connection.arguments= 6 | connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(5.6.2)) 7 | connection.java.home=null 8 | connection.jvm.arguments= 9 | connection.project.dir= 10 | derived.resources=.gradle,build 11 | eclipse.preferences.version=1 12 | gradle.user.home= 13 | java.home= 14 | jvm.arguments= 15 | natures=org.eclipse.jdt.core.javanature 16 | offline.mode=false 17 | override.workspace.settings=true 18 | project.path=\: 19 | show.console.view=true 20 | show.executions.view=true 21 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | yacy_grid_loader 4 | Project yacy_grid_loader created by Buildship. 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.buildship.core.gradleprojectbuilder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.buildship.core.gradleprojectnature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ## yacy_grid_loader dockerfile 2 | ## examples: 3 | # docker build -t yacy_grid_loader . 4 | # docker run -d --rm -p 8200:8200 --name yacy_grid_loader yacy_grid_loader 5 | ## Check if the service is running: 6 | # curl http://localhost:8200/yacy/grid/mcp/info/status.json 7 | 8 | # build app 9 | FROM eclipse-temurin:8-jdk-focal AS appbuilder 10 | COPY ./ /app 11 | WORKDIR /app 12 | RUN ./gradlew clean shadowDistTar 13 | 14 | # build dist 15 | FROM eclipse-temurin:8-jre-focal 16 | LABEL maintainer="Michael Peter Christen " 17 | ENV DEBIAN_FRONTEND noninteractive 18 | ARG default_branch=master 19 | COPY ./conf /app/conf/ 20 | COPY --from=appbuilder /app/build/libs/ ./app/build/libs/ 21 | WORKDIR /app 22 | EXPOSE 8200 23 | 24 | # for some weird reason the jar file is sometimes not named correctly 25 | RUN if [ -e /app/build/libs/app-0.0.1-SNAPSHOT-all.jar ] ; then mv /app/build/libs/app-0.0.1-SNAPSHOT-all.jar /app/build/libs/yacy_grid_loader-0.0.1-SNAPSHOT-all.jar; fi 26 | 27 | CMD ["java", "-Xms320M", "-Xmx3G", "-jar", "/app/build/libs/yacy_grid_loader-0.0.1-SNAPSHOT-all.jar"] 28 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/HttpClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * HttpClient 3 | * Copyright 24.2.2018 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.retrieval; 21 | 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | public interface HttpClient { 26 | 27 | public int getStatusCode(); 28 | 29 | public String getMime(); 30 | 31 | public Map> getHeader(); 32 | 33 | public String getRequestHeader(); 34 | 35 | public String getResponseHeader(); 36 | 37 | public byte[] getContent(); 38 | } 39 | -------------------------------------------------------------------------------- /bin/start_loader_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "`dirname $0`" 3 | 4 | bindhost="127.0.0.1" 5 | callhost="localhost" 6 | appname="YaCy Grid Loader" 7 | containername=yacy-grid-loader 8 | imagename=${containername//-/_} 9 | dockerfile="Dockerfile" 10 | production=false 11 | open=false 12 | 13 | usage() { echo "usage: $0 [-o | --open | -p | --production | --arm32 | --arm64 ]" 1>&2; exit 1; } 14 | 15 | while [[ $# -gt 0 ]]; do 16 | case "$1" in 17 | -p | --production ) production=true; shift 1;; 18 | -o | --open ) open=true; shift 1;; 19 | --arm32 ) imagename=${imagename}:arm32; dockerfile=${dockerfile}_arm32; shift 1;; 20 | --arm64 ) imagename=${imagename}:arm64; dockerfile=${dockerfile}_arm64; shift 1;; 21 | -h | --help | -* | --* | * ) usage;; 22 | esac 23 | done 24 | if [ "$production" = true ] ; then imagename="yacy/${imagename}"; fi 25 | if [ "$open" = true ] ; then bindhost="0.0.0.0"; callhost=`hostname`; fi 26 | 27 | containerRuns=$(docker ps | grep -i "${containername}" | wc -l ) 28 | containerExists=$(docker ps -a | grep -i "${containername}" | wc -l ) 29 | if [ ${containerRuns} -gt 0 ]; then 30 | echo "${appname} container is already running" 31 | elif [ ${containerExists} -gt 0 ]; then 32 | docker start ${containername} 33 | echo "${appname} container re-started" 34 | else 35 | if [[ $imagename != "yacy/"*":latest" ]] && [[ "$(docker images -q ${imagename} 2> /dev/null)" == "" ]]; then 36 | cd .. 37 | docker build -t ${imagename} -f ${dockerfile} . 38 | cd bin 39 | fi 40 | docker run -d --restart=unless-stopped -p ${bindhost}:8200:8200 \ 41 | --link yacy-grid-minio --link yacy-grid-rabbitmq --link yacy-grid-elasticsearch --link yacy-grid-mcp \ 42 | -e YACYGRID_GRID_MCP_ADDRESS=yacy-grid-mcp \ 43 | --name ${containername} ${imagename} 44 | echo "${appname} started." 45 | fi 46 | docker ps -a --format "table {{.ID}}\t{{.Image}}\t{{.Names}}\t{{.Mounts}}\t{{.Ports}}" 47 | 48 | echo "To get the app status, open http://${callhost}:8200/yacy/grid/mcp/info/status.json" 49 | -------------------------------------------------------------------------------- /conf/config.properties: -------------------------------------------------------------------------------- 1 | port = 8200 2 | grid.mcp.address = 127.0.0.1:8100,node00.local:8100,brain.local:8100,searchlab.eu:8100 3 | grid.broker.lazy = true 4 | grid.broker.queue.limit = 0 5 | grid.broker.queue.throttling = 100000 6 | grid.assets.delete = true 7 | grid.loader.disableHeadless = false 8 | 9 | # setting for the user agent type: 10 | # the type is either CUSTOM, YACY, GOOGLE or BROWSER. That means: 11 | # - CUSTOM : user your own user agent. The name must be set in the property grid.loader.userAgentName 12 | # - YACY : use the YaCyBot user agent, i.e. "yacybot (v2 sysinfo) http://yacy.net/bot.html" 13 | # - GOOGLE : the googlebot user agent, i.e. "Googlebot/2.1 (+http://www.google.com/bot.html)" 14 | # - BROWSER : a random browser user agent, i.e. "Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0" 15 | grid.loader.userAgentType = BROWSER 16 | 17 | 18 | 19 | #################################################################### 20 | ## The following properties must be identical to those in the MCP ## 21 | #################################################################### 22 | 23 | # The grid name is used to separate different grid networks. 24 | # Only networks with the same name connect with each other 25 | grid.name = freeworld 26 | 27 | # Index names of the grid indexes: 28 | # crawlstart : a history of all crawl starts 29 | # crawler : tracking of crawling progress 30 | # query : a history of all queries 31 | # web : the document search index ("web index", there) 32 | grid.elasticsearch.indexName.crawlstart = crawlstart 33 | grid.elasticsearch.indexName.crawler = crawler 34 | grid.elasticsearch.indexName.query = query 35 | grid.elasticsearch.indexName.web = web 36 | 37 | # the following type name is an intermediate solution to migrate from elastic 6.x to 8.x 38 | # unfortunately the current index type name is 'web' but in future elastic versions the name '_doc' 39 | # is mandatory. We will use this setting until migration to elastic 8.x is complete and delete 40 | # the configuration afterwards. 41 | grid.elasticsearch.typeName = web -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YaCy Grid Component: Loader 2 | 3 | The YaCy Grid is the second-generation implementation of YaCy, a peer-to-peer search engine. 4 | A YaCy Grid installation consists of a set of micro-services which communicate with each other 5 | using the MCP, see https://github.com/yacy/yacy_grid_mcp 6 | 7 | ## Purpose 8 | 9 | The Loader is a microservice which can be deployed i.e. using Docker. 10 | Each search engine needs a file loader and this component will do that work. 11 | The special feature of this loader is it's embedded headless browser which makes 12 | it possible to load rich content and provide that content for a search engine. 13 | 14 | ## What it does 15 | 16 | When the Loader component is started, it searches for a MCP and connects to it. 17 | By default the local host is searched for a MCP but you can configure one yourself. 18 | 19 | The Loader will then wait for client requests and performs web loading upon request. 20 | It also has a MCP queue listener to react on loading requests in the working queues. 21 | After loading of content the loader will push back results to the MCP storage and puts 22 | another message on the MCP message queue to process the loaded content. 23 | 24 | ## Installation: Download, Build, Run 25 | At this time, yacy_grid_parser is not provided in compiled form, you easily build it yourself. It's not difficult and done in one minute! The source code is hosted at https://github.com/yacy/yacy_grid_loader, you can download and run it with: 26 | 27 | > git clone --recursive https://github.com/yacy/yacy_grid_loader.git 28 | 29 | If you just want to make a update, do the following 30 | 31 | > git pull origin master 32 | > git submodule foreach git pull origin master 33 | 34 | To build and start the loader, run 35 | 36 | > cd yacy_grid_loader 37 | > gradle run 38 | 39 | Please read also https://github.com/yacy/yacy_grid_mcp/blob/master/README.md for further details. 40 | 41 | 42 | ## Contribute 43 | 44 | This is a community project and your contribution is welcome! 45 | 46 | 1. Check for [open issues](https://github.com/yacy/yacy_grid_loader/issues) 47 | or open a fresh one to start a discussion around a feature idea or a bug. 48 | 2. Fork [the repository](https://github.com/yacy/yacy_grid_loader.git) 49 | on GitHub to start making your changes (branch off of the master branch). 50 | 3. Write a test that shows the bug was fixed or the feature works as expected. 51 | 4. Send a pull request and bug us on Gitter until it gets merged and published. :) 52 | 53 | 54 | ## What is the software license? 55 | LGPL 2.1 56 | 57 | Have fun! 58 | 59 | @0rb1t3r 60 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/api/LoaderService.java: -------------------------------------------------------------------------------- 1 | /** 2 | * LoaderService 3 | * Copyright 25.4.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.api; 21 | 22 | import javax.servlet.http.HttpServletResponse; 23 | 24 | import org.json.JSONArray; 25 | import org.json.JSONObject; 26 | 27 | import ai.susi.mind.SusiAction; 28 | import ai.susi.mind.SusiThought; 29 | import net.yacy.grid.http.APIHandler; 30 | import net.yacy.grid.http.ObjectAPIHandler; 31 | import net.yacy.grid.http.Query; 32 | import net.yacy.grid.http.ServiceResponse; 33 | import net.yacy.grid.loader.retrieval.ContentLoader; 34 | 35 | /** 36 | * 37 | * Test URL: 38 | * http://localhost:8200/yacy/grid/loader/warcloader.warc.gz?url=http://yacy.net 39 | * 40 | * Test command: 41 | * curl -o yacy.net.warc.gz "http://localhost:8200/yacy/grid/loader/warcloader.warc.gz?collection=test&url=http://yacy.net" 42 | * parse this warc with: 43 | * curl -X POST -F "sourcebytes=@yacy.net.warc.gz;type=application/octet-stream" http://127.0.0.1:8500/yacy/grid/parser/parser.json 44 | */ 45 | public class LoaderService extends ObjectAPIHandler implements APIHandler { 46 | 47 | private static final long serialVersionUID = 8578474303031749879L; 48 | public static final String NAME = "warcloader"; 49 | 50 | @Override 51 | public String getAPIPath() { 52 | return "/yacy/grid/loader/" + NAME + ".warc.gz"; 53 | } 54 | 55 | @Override 56 | public ServiceResponse serviceImpl(Query call, HttpServletResponse response) { 57 | // construct the same process as if it was submitted on a queue 58 | SusiThought process = ProcessService.queryToProcess(call); 59 | 60 | // extract call parameter here to enhance ability to debug 61 | SusiAction action = process.getActions().get(0); 62 | JSONArray processData = process.getData(); 63 | 64 | // find out if we should do headless loading 65 | String crawlID = action.getStringAttr("id"); 66 | JSONObject crawl = SusiThought.selectData(processData, "id", crawlID); 67 | int depth = action.getIntAttr("depth"); 68 | int crawlingDepth = crawl.getInt("crawlingDepth"); 69 | int priority = crawl.has("priority") ? crawl.getInt("priority") : 0; 70 | boolean loaderHeadless = crawl.has("loaderHeadless") ? crawl.getBoolean("loaderHeadless") : true; 71 | 72 | // construct a WARC 73 | String threadname = "api call from " + call.getClientHost(); 74 | ContentLoader cl = new ContentLoader(action, processData, true, threadname, crawlID, depth, crawlingDepth, loaderHeadless, priority); 75 | byte[] b = cl.getContent(); 76 | 77 | // store the WARC as asset if wanted 78 | return new ServiceResponse(b); 79 | } 80 | 81 | } 82 | 83 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/JavaHttpClient.java: -------------------------------------------------------------------------------- 1 | package net.yacy.grid.loader.retrieval; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.HttpURLConnection; 8 | import java.net.URL; 9 | import java.nio.charset.StandardCharsets; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | import org.apache.http.Header; 14 | import org.apache.http.RequestLine; 15 | 16 | import net.yacy.grid.http.ClientConnection; 17 | import net.yacy.grid.http.ClientIdentification; 18 | 19 | public class JavaHttpClient implements HttpClient { 20 | 21 | private static final String CRLF = new String(ClientConnection.CRLF, StandardCharsets.US_ASCII); 22 | private static String userAgentDefault = ClientIdentification.browserAgent.userAgent; 23 | 24 | private int status_code; 25 | private String mime; 26 | private Map> header; 27 | private String requestHeader, responseHeader; 28 | private byte[] content; 29 | 30 | public static void initClient(String userAgent) { 31 | userAgentDefault = userAgent; 32 | } 33 | 34 | public JavaHttpClient(String url, boolean head) throws IOException { 35 | 36 | HttpURLConnection connection = ((HttpURLConnection) new URL(url).openConnection()); 37 | if (head) connection.setRequestMethod("HEAD"); 38 | connection.addRequestProperty("User-Agent", userAgentDefault); 39 | 40 | // compute the request header (we do this to have a documentation later of what we did) 41 | Map> map = connection.getRequestProperties(); 42 | StringBuffer sb = new StringBuffer(); 43 | String special = connection.getHeaderField(0); 44 | sb.append(connection.getRequestMethod() + " " + url).append(CRLF); 45 | for (Map.Entry> entry: connection.getRequestProperties().entrySet()) { 46 | String key = entry.getKey(); 47 | for (String value: entry.getValue()) { 48 | sb.append(key).append(": ").append(value).append(CRLF); 49 | } 50 | } 51 | sb.append(CRLF); 52 | this.requestHeader = sb.toString(); 53 | 54 | 55 | InputStream input; 56 | if (connection.getResponseCode() == 200) // this must be called before 'getErrorStream()' works 57 | input = connection.getInputStream(); 58 | else input = connection.getErrorStream(); 59 | BufferedReader reader = new BufferedReader(new InputStreamReader(input)); 60 | String msg; 61 | while ((msg =reader.readLine()) != null) 62 | System.out.println(msg); 63 | } 64 | 65 | 66 | @Override 67 | public int getStatusCode() { 68 | return status_code; 69 | } 70 | 71 | @Override 72 | public String getMime() { 73 | return mime; 74 | } 75 | 76 | @Override 77 | public Map> getHeader() { 78 | return header; 79 | } 80 | 81 | @Override 82 | public String getRequestHeader() { 83 | return requestHeader; 84 | } 85 | 86 | @Override 87 | public String getResponseHeader() { 88 | return responseHeader; 89 | } 90 | 91 | @Override 92 | public byte[] getContent() { 93 | return this.content; 94 | } 95 | 96 | public static void main(String[] args) { 97 | try { 98 | JavaHttpClient client = new JavaHttpClient("https://krefeld.polizei.nrw/", true); 99 | } catch (IOException e) { 100 | e.printStackTrace(); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/JwatWarcWriter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * JwatWarcWriter 3 | * Copyright 11.5.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader; 21 | 22 | import java.io.ByteArrayInputStream; 23 | import java.io.IOException; 24 | import java.security.MessageDigest; 25 | import java.security.NoSuchAlgorithmException; 26 | import java.util.Date; 27 | 28 | import org.apache.commons.codec.binary.Base32; 29 | import org.jwat.warc.WarcRecord; 30 | import org.jwat.warc.WarcWriter; 31 | 32 | import net.yacy.grid.tools.DateParser; 33 | import net.yacy.grid.tools.Logger; 34 | 35 | /** 36 | * for a documentation, see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ 37 | * @author admin 38 | * 39 | */ 40 | public class JwatWarcWriter { 41 | 42 | public static void writeWarcinfo(final WarcWriter writer, final Date date, final String warcinfo_uuid, final String filename, final byte[] payload) throws IOException { 43 | try { 44 | final WarcRecord record = WarcRecord.createRecord(writer); 45 | record.header.addHeader("WARC-Type", "warcinfo"); 46 | if (warcinfo_uuid != null) record.header.addHeader("WARC-Record-ID", ""); 47 | record.header.addHeader("WARC-Date", DateParser.iso8601Format.format(date)); 48 | if (filename != null) record.header.addHeader("WARC-Filename", filename); 49 | record.header.addHeader("Content-Length", Long.toString(payload.length)); 50 | record.header.addHeader("Content-Type", "application/warc-fields"); 51 | writer.writeHeader(record); 52 | final ByteArrayInputStream inBytes = new ByteArrayInputStream(payload); 53 | writer.streamPayload(inBytes); 54 | writer.closeRecord(); // java.lang.NoSuchMethodError: java.nio.ByteBuffer.flip()Ljava/nio/ByteBuffer; 55 | } catch (final NoSuchMethodError e) { 56 | Logger.warn(e); 57 | throw new IOException(e.getMessage()); 58 | // the writer may fail because of a java 8 class error 59 | /* 60 | java.lang.NoSuchMethodError: java.nio.ByteBuffer.flip()Ljava/nio/ByteBuffer; 61 | at org.jwat.gzip.GzipWriter$GzipEntryOutputStream.close(GzipWriter.java:513) 62 | at org.jwat.gzip.GzipEntry.close(GzipEntry.java:142) 63 | at org.jwat.warc.WarcWriterCompressed.closeRecord(WarcWriterCompressed.java:100) 64 | */ 65 | } 66 | } 67 | 68 | public static void writeRequest(final WarcWriter writer, final String url, final String ip, final Date date, final String warcrecord_uuid, final String warcinfo_uuid, final byte[] payload) throws IOException { 69 | final WarcRecord record = WarcRecord.createRecord(writer); 70 | record.header.addHeader("WARC-Type", "request"); 71 | record.header.addHeader("WARC-Target-URI", url); 72 | record.header.addHeader("Content-Type", "application/http;msgtype=request"); 73 | record.header.addHeader("WARC-Date", DateParser.iso8601Format.format(date)); 74 | if (warcrecord_uuid != null) record.header.addHeader("WARC-Record-ID", ""); 75 | if (ip != null) record.header.addHeader("WARC-IP-Address", ip); 76 | if (warcinfo_uuid != null) record.header.addHeader("WARC-Warcinfo-ID", ""); 77 | //record.header.addHeader("WARC-Block-Digest", "sha1:" + sha1(payload)); 78 | record.header.addHeader("Content-Length", Long.toString(payload.length)); 79 | writer.writeHeader(record); 80 | final ByteArrayInputStream inBytes = new ByteArrayInputStream(payload); 81 | writer.streamPayload(inBytes); 82 | writer.closeRecord(); 83 | } 84 | 85 | public static void writeResponse(final WarcWriter writer, final String url, final String ip, final Date date, final String warcrecord_uuid, final String warcinfo_uuid, final byte[] payload) throws IOException { 86 | final WarcRecord record = WarcRecord.createRecord(writer); 87 | record.header.addHeader("WARC-Type", "response"); 88 | if (warcrecord_uuid != null) record.header.addHeader("WARC-Record-ID", ""); 89 | if (warcinfo_uuid != null) record.header.addHeader("WARC-Warcinfo-ID", ""); 90 | record.header.addHeader("WARC-Target-URI", url); 91 | record.header.addHeader("WARC-Date", DateParser.iso8601Format.format(date)); 92 | if (ip != null) record.header.addHeader("WARC-IP-Address", ip); 93 | //record.header.addHeader("WARC-Block-Digest", "sha1:" + sha1(payload)); 94 | //record.header.addHeader("WARC-Payload-Digest", "sha1:" + sha1(payload)); 95 | record.header.addHeader("Content-Type", "application/http;msgtype=response"); 96 | record.header.addHeader("Content-Length", Long.toString(payload.length)); 97 | writer.writeHeader(record); 98 | final ByteArrayInputStream inBytes = new ByteArrayInputStream(payload); 99 | writer.streamPayload(inBytes); 100 | writer.closeRecord(); 101 | } 102 | 103 | /** 104 | * compute a sha1 in base32 format 105 | * We choosed that format, because WGET does the same 106 | * @param b 107 | * @return a base32 string for the sha1 of the input 108 | */ 109 | public static String sha1(final byte[] b) { 110 | try { 111 | final MessageDigest sha1 = MessageDigest.getInstance("SHA-1"); 112 | sha1.reset(); 113 | sha1.update(b); 114 | return new Base32().encodeAsString(b); 115 | } catch (final NoSuchAlgorithmException e) { 116 | e.printStackTrace(); 117 | return ""; 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/api/ProcessService.java: -------------------------------------------------------------------------------- 1 | /** 2 | * LoaderService 3 | * Copyright 25.4.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.api; 21 | 22 | import java.io.IOException; 23 | 24 | import javax.servlet.http.HttpServletResponse; 25 | 26 | import org.json.JSONArray; 27 | import org.json.JSONObject; 28 | 29 | import ai.susi.mind.SusiAction; 30 | import ai.susi.mind.SusiAction.RenderType; 31 | import ai.susi.mind.SusiThought; 32 | import net.yacy.grid.http.APIHandler; 33 | import net.yacy.grid.http.ObjectAPIHandler; 34 | import net.yacy.grid.http.Query; 35 | import net.yacy.grid.http.ServiceResponse; 36 | import net.yacy.grid.loader.retrieval.ContentLoader; 37 | import net.yacy.grid.mcp.Service; 38 | 39 | /** 40 | * 41 | * Test URL: 42 | * http://localhost:8200/yacy/grid/loader/warcloader.warc.gz?url=http://yacy.net 43 | * 44 | * Test command: 45 | * curl "http://localhost:8200/yacy/grid/loader/warcprocess.json?collection=test&targetasset=test/yacy.net.warc.gz&url=http://yacy.net" 46 | * places the warc file on the asset store 47 | */ 48 | public class ProcessService extends ObjectAPIHandler implements APIHandler { 49 | 50 | private static final long serialVersionUID = 8578474303031749879L; 51 | public static final String NAME = "warcprocess"; 52 | 53 | @Override 54 | public String getAPIPath() { 55 | return "/yacy/grid/loader/" + NAME + ".json"; 56 | } 57 | 58 | @Override 59 | public ServiceResponse serviceImpl(final Query call, final HttpServletResponse response) { 60 | // construct the same process as if it was submitted on a queue 61 | final SusiThought process = queryToProcess(call); 62 | final SusiAction action = process.getActions().iterator().next(); 63 | final JSONArray data = process.getData(); 64 | 65 | // find out if we should do headless loading 66 | final String crawlID = action.getStringAttr("id"); 67 | final JSONObject crawl = SusiThought.selectData(data, "id", crawlID); 68 | final int depth = action.getIntAttr("depth"); 69 | final int crawlingDepth = crawl.getInt("crawlingDepth"); 70 | final int priority = crawl.has("priority") ? crawl.getInt("priority") : 0; 71 | final boolean loaderHeadless = crawl.has("loaderHeadless") ? crawl.getBoolean("loaderHeadless") : true; 72 | 73 | // construct a WARC 74 | final String targetasset = process.getObservation("targetasset"); 75 | final ContentLoader cl = new ContentLoader( 76 | process.getActions().get(0), process.getData(), targetasset.endsWith(".gz"), "api call from " + call.getClientHost(), 77 | crawlID, depth, crawlingDepth, loaderHeadless, priority); 78 | final byte[] b = cl.getContent(); 79 | 80 | // store the WARC as asset if wanted 81 | final JSONObject json = new JSONObject(true); 82 | if (targetasset != null && targetasset.length() > 0) { 83 | try { 84 | Service.instance.config.gridStorage.store(targetasset, b); 85 | json.put(ObjectAPIHandler.SUCCESS_KEY, true); 86 | json.put(ObjectAPIHandler.COMMENT_KEY, "asset stored"); 87 | } catch (final IOException e) { 88 | e.printStackTrace(); 89 | json.put(ObjectAPIHandler.SUCCESS_KEY, false); 90 | json.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage()); 91 | } 92 | } else { 93 | json.put(ObjectAPIHandler.SUCCESS_KEY, false); 94 | json.put(ObjectAPIHandler.COMMENT_KEY, "this process requires a 'targetasset' attribute"); 95 | } 96 | return new ServiceResponse(json); 97 | } 98 | 99 | public static SusiThought queryToProcess(final Query call) { 100 | // read query attributes 101 | final String id = call.get("id", "*id*"); // the crawl id 102 | String url = call.get("url", ""); 103 | final int urlCount = call.get("urlCount", 0); 104 | final int depth = call.get("depth", 0); 105 | final int crawlingDepth = call.get("crawlingDepth", 0); // the maximum depth for the crawl start of this domain 106 | final boolean loaderHeadless = call.get("loaderHeadless", false); 107 | final int priority = call.get("priority", 0); 108 | final String collection = call.get("collection", ""); 109 | final String targetasset = call.get("targetasset", ""); 110 | 111 | // construct an object that could be taken from the queue server 112 | final SusiThought process = new SusiThought(); 113 | process.setProcess("yacy_grid_loader"); 114 | if (collection.length() > 0) process.addObservation("collection", collection); 115 | 116 | final JSONObject crawl = new JSONObject(); 117 | crawl.put("id", id); 118 | crawl.put("start_url", url); 119 | crawl.put("crawlingDepth", crawlingDepth); 120 | crawl.put("priority", priority); 121 | crawl.put("loaderHeadless", loaderHeadless); 122 | 123 | // create action 124 | final JSONObject action = new JSONObject(); 125 | final JSONArray urls = new JSONArray(); 126 | if (url.length() > 0) urls.put(url); 127 | if (urlCount > 0) for (int i = 0; i < urlCount; i++) { 128 | url = call.get("url_" + i, ""); 129 | if (url.length() > 0) urls.put(url); 130 | } 131 | action.put("id", id); 132 | action.put("type", RenderType.loader.name()); 133 | action.put("queue", "loader"); 134 | action.put("urls", urls); 135 | action.put("depth", depth); 136 | if (collection.length() > 0) action.put("collection", collection); 137 | if (targetasset.length() > 0) action.put("targetasset", targetasset); 138 | process.addAction(new SusiAction(action)); 139 | process.setData(new JSONArray().put(crawl)); 140 | 141 | return process; 142 | } 143 | 144 | } 145 | 146 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/Loader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Loader 3 | * Copyright 25.04.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader; 21 | 22 | import java.util.ArrayList; 23 | import java.util.Arrays; 24 | import java.util.List; 25 | 26 | import javax.servlet.Servlet; 27 | 28 | import net.yacy.grid.YaCyServices; 29 | import net.yacy.grid.http.ClientConnection; 30 | import net.yacy.grid.http.ClientIdentification; 31 | import net.yacy.grid.loader.api.LoaderService; 32 | import net.yacy.grid.loader.api.ProcessService; 33 | import net.yacy.grid.loader.retrieval.LoaderClientConnection; 34 | import net.yacy.grid.mcp.BrokerListener; 35 | import net.yacy.grid.mcp.Configuration; 36 | import net.yacy.grid.mcp.MCP; 37 | import net.yacy.grid.mcp.Service; 38 | import net.yacy.grid.tools.CronBox; 39 | import net.yacy.grid.tools.CronBox.Telemetry; 40 | import net.yacy.grid.tools.Logger; 41 | 42 | /** 43 | * The Loader main class 44 | * 45 | * performance debugging: 46 | * http://localhost:8200/yacy/grid/mcp/info/threaddump.txt 47 | * http://localhost:8200/yacy/grid/mcp/info/threaddump.txt?count=100 * 48 | */ 49 | public class Loader { 50 | 51 | private final static YaCyServices LOADER_SERVICE = YaCyServices.loader; // check with http://localhost:8200/yacy/grid/mcp/status.json 52 | private final static String DATA_PATH = "data"; 53 | 54 | // define services 55 | @SuppressWarnings("unchecked") 56 | public final static Class[] LOADER_SERVICES = new Class[]{ 57 | // app services 58 | LoaderService.class, 59 | ProcessService.class 60 | }; 61 | 62 | public static class Application implements CronBox.Application { 63 | 64 | final Configuration config; 65 | final Service service; 66 | final BrokerListener brokerApplication; 67 | final CronBox.Application serviceApplication; 68 | 69 | public Application() { 70 | Logger.info("Starting Crawler Application..."); 71 | 72 | // initialize configuration 73 | final List> services = new ArrayList<>(); 74 | services.addAll(Arrays.asList(MCP.MCP_SERVLETS)); 75 | services.addAll(Arrays.asList(LOADER_SERVICES)); 76 | this.config = new Configuration(DATA_PATH, true, LOADER_SERVICE, services.toArray(new Class[services.size()])); 77 | 78 | // initialize loader with user agent 79 | String userAgent = ClientIdentification.getAgent(ClientIdentification.googleAgentName/*.yacyInternetCrawlerAgentName*/).userAgent; 80 | String userAgentType = this.config.properties.get("grid.loader.userAgentType"); 81 | if (userAgentType == null || userAgentType.length() == 0) userAgentType = "BROWSER"; 82 | if ("CUSTOM".equals(userAgentType)) userAgent = this.config.properties.get("grid.lodeer.userAgentName"); 83 | else if ("YACY".equals(userAgentType)) userAgent = ClientIdentification.yacyInternetCrawlerAgent.userAgent; 84 | else if ("GOOGLE".equals(userAgentType)) userAgent = ClientIdentification.getAgent(ClientIdentification.googleAgentName).userAgent; 85 | else userAgent = ClientIdentification.getAgent(ClientIdentification.browserAgentName).userAgent; 86 | LoaderClientConnection.userAgent = userAgent; 87 | 88 | // initialize REST server with services 89 | this.service = new Service(this.config); 90 | 91 | // connect backend 92 | this.config.connectBackend(); 93 | 94 | // initiate broker application: listening to indexing requests at RabbitMQ 95 | final boolean disableHeadless = this.config.properties.containsKey("grid.loader.disableHeadless") ? Boolean.parseBoolean(this.config.properties.get("grid.loader.disableHeadless")) : false; 96 | this.brokerApplication = new LoaderListener(LOADER_SERVICE, disableHeadless); 97 | 98 | // initiate service application: listening to REST request 99 | this.serviceApplication = this.service.newServer(null); 100 | } 101 | 102 | @Override 103 | public void run() { 104 | 105 | Logger.info("Grid Name: " + this.config.properties.get("grid.name")); 106 | 107 | // starting threads 108 | new Thread(this.brokerApplication).start(); 109 | this.serviceApplication.run(); // SIC! the service application is running as the core element of this run() process. If we run it concurrently, this runnable will be "dead". 110 | } 111 | 112 | @Override 113 | public void stop() { 114 | Logger.info("Stopping MCP Application..."); 115 | this.serviceApplication.stop(); 116 | this.brokerApplication.stop(); 117 | this.service.stop(); 118 | this.service.close(); 119 | this.config.close(); 120 | } 121 | 122 | @Override 123 | public Telemetry getTelemetry() { 124 | return null; 125 | } 126 | 127 | } 128 | 129 | public static void main(final String[] args) { 130 | // run in headless mode 131 | System.setProperty("java.awt.headless", "true"); // no awt used here so we can switch off that stuff 132 | 133 | // Debug Info 134 | boolean assertionenabled = false; 135 | assert (assertionenabled = true) == true; // compare to true to remove warning: "Possible accidental assignement" 136 | if (assertionenabled) Logger.info("Asserts are enabled"); 137 | 138 | // first greeting 139 | Logger.info("YaCy Grid Loader started!"); 140 | 141 | // run application with cron 142 | final long cycleDelay = Long.parseLong(System.getProperty("YACYGRID_LOADER_CYCLEDELAY", "" + Long.MAX_VALUE)); // by default, run only in one genesis thread 143 | final int cycleRandom = Integer.parseInt(System.getProperty("YACYGRID_LOADER_CYCLERANDOM", "" + 1000 * 60 /*1 minute*/)); 144 | final CronBox cron = new CronBox(Application.class, cycleDelay, cycleRandom); 145 | cron.cycle(); 146 | 147 | // this line is reached if the cron process was shut down 148 | Logger.info("YaCy Grid Loader terminated"); 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/LoaderListener.java: -------------------------------------------------------------------------------- 1 | /** 2 | * LoaderListener 3 | * Copyright 25.04.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader; 21 | 22 | import org.json.JSONArray; 23 | import org.json.JSONObject; 24 | 25 | import ai.susi.mind.SusiAction; 26 | import ai.susi.mind.SusiThought; 27 | import net.yacy.grid.YaCyServices; 28 | import net.yacy.grid.loader.retrieval.ContentLoader; 29 | import net.yacy.grid.mcp.AbstractBrokerListener; 30 | import net.yacy.grid.mcp.BrokerListener; 31 | import net.yacy.grid.mcp.Service; 32 | import net.yacy.grid.tools.CronBox.Telemetry; 33 | import net.yacy.grid.tools.Logger; 34 | import net.yacy.grid.tools.Memory; 35 | 36 | /** 37 | * broker listener, takes process messages from the queue "loader", "webloader" 38 | * i.e. test with: 39 | * curl -X POST -F "message=@job.json" -F "serviceName=loader" -F "queueName=webloader" http://yacygrid.com:8100/yacy/grid/mcp/messages/send.json 40 | * where job.json is: 41 | { 42 | "metadata": { 43 | "process": "yacy_grid_loader", 44 | "count": 1 45 | }, 46 | "data": [{ 47 | "crawlingMode": "url", 48 | "crawlingURL": "http://yacy.net", 49 | "sitemapURL": "", 50 | "crawlingFile": "", 51 | "crawlingDepth": 3, 52 | "crawlingDepthExtension": "", 53 | "range": "domain", 54 | "mustmatch": ".*", 55 | "mustnotmatch": "", 56 | "ipMustmatch": ".*", 57 | "ipMustnotmatch": "", 58 | "indexmustmatch": ".*", 59 | "indexmustnotmatch": "", 60 | "deleteold": "off", 61 | "deleteIfOlderNumber": 0, 62 | "deleteIfOlderUnit": "day", 63 | "recrawl": "nodoubles", 64 | "reloadIfOlderNumber": 0, 65 | "reloadIfOlderUnit": "day", 66 | "crawlingDomMaxCheck": "off", 67 | "crawlingDomMaxPages": 1000, 68 | "crawlingQ": "off", 69 | "cachePolicy": "if fresh", 70 | "collection": "user", 71 | "agentName": "yacybot (yacy.net; crawler from yacygrid.com)", 72 | "user": "anonymous@nowhere.com", 73 | "client": "yacygrid.com" 74 | }], 75 | "actions": [{ 76 | "type": "loader", 77 | "queue": "webloader", 78 | "urls": ["http://yacy.net"], 79 | "collection": "test", 80 | "targetasset": "test3/yacy.net.warc.gz", 81 | "actions": [{ 82 | "type": "parser", 83 | "queue": "yacyparser", 84 | "sourceasset": "test3/yacy.net.warc.gz", 85 | "targetasset": "test3/yacy.net.jsonlist", 86 | "targetgraph": "test3/yacy.net.graph.json", 87 | "actions": [{ 88 | "type": "indexer", 89 | "queue": "elasticsearch", 90 | "sourceasset": "test3/yacy.net.jsonlist" 91 | },{ 92 | "type": "crawler", 93 | "queue": "webcrawler", 94 | "sourceasset": "test3/yacy.net.graph.json" 95 | } 96 | ] 97 | }] 98 | }] 99 | } 100 | * 101 | * to check the queue content, see http://www.searchlab.eu:15672/ 102 | */ 103 | public class LoaderListener extends AbstractBrokerListener implements BrokerListener { 104 | 105 | private final boolean disableHeadless; 106 | 107 | public LoaderListener(final YaCyServices service, final boolean disableHeadless) { 108 | super(Service.instance.config, service, Runtime.getRuntime().availableProcessors()); 109 | this.disableHeadless = disableHeadless; 110 | } 111 | 112 | @Override 113 | public ActionResult processAction(final SusiAction action, final JSONArray processData, final String processName, final int processNumber) { 114 | 115 | // check short memory status 116 | if (Memory.shortStatus()) { 117 | Logger.info(this.getClass(), "Loader short memory status: assigned = " + Memory.assigned() + ", used = " + Memory.used()); 118 | } 119 | 120 | // find out if we should do headless loading 121 | final String crawlID = action.getStringAttr("id"); 122 | if (crawlID == null || crawlID.length() == 0) { 123 | Logger.info(this.getClass(), "Loader.processAction Fail: Action does not have an id: " + action.toString()); 124 | return ActionResult.FAIL_IRREVERSIBLE; 125 | } 126 | final JSONObject crawl = SusiThought.selectData(processData, "id", crawlID); 127 | if (crawl == null) { 128 | Logger.info(this.getClass(), "Loader.processAction Fail: ID of Action not found in data: " + action.toString()); 129 | return ActionResult.FAIL_IRREVERSIBLE; 130 | } 131 | final int depth = action.getIntAttr("depth"); 132 | final int crawlingDepth = crawl.getInt("crawlingDepth"); 133 | final int priority = crawl.has("priority") ? crawl.getInt("priority") : 0; 134 | boolean loaderHeadless = crawl.has("loaderHeadless") ? crawl.getBoolean("loaderHeadless") : true; 135 | if (this.disableHeadless) loaderHeadless = false; 136 | 137 | final String targetasset = action.getStringAttr("targetasset"); 138 | final boolean archivewarc = action.getBooleanAttr("archivewarc"); 139 | final String threadnameprefix = processName + "-" + processNumber; 140 | Thread.currentThread().setName(threadnameprefix + " targetasset=" + targetasset); 141 | if (targetasset != null && targetasset.length() > 0) { 142 | ActionResult actionResult = ActionResult.SUCCESS; 143 | final byte[] b; 144 | try { 145 | final ContentLoader cl = new ContentLoader(action, processData, targetasset.endsWith(".gz"), threadnameprefix, crawlID, depth, crawlingDepth, loaderHeadless, priority); 146 | b = cl.getContent(); 147 | actionResult = cl.getResult(); 148 | } catch (final Throwable e) { 149 | Logger.warn(this.getClass(), e); 150 | return ActionResult.FAIL_IRREVERSIBLE; 151 | } 152 | if (actionResult == ActionResult.FAIL_IRREVERSIBLE) { 153 | Logger.info(this.getClass(), "Loader.processAction FAILED processed message for targetasset " + targetasset); 154 | return actionResult; 155 | } 156 | Logger.info(this.getClass(), "Loader.processAction SUCCESS processed message for targetasset " + targetasset); 157 | boolean storeToMessage = true; // debug version for now: always true TODO: set to false later 158 | // ATTENTION: we should not send binaries larger than 512MB to RabbitMQ, see https://github.com/rabbitmq/rabbitmq-server/issues/147#issuecomment-470882099 159 | if (!storeToMessage || (archivewarc && Service.instance.config.gridStorage.isS3Connected())) { 160 | try { 161 | Service.instance.config.gridStorage.store(targetasset, b); 162 | Logger.info(this.getClass(), "Loader.processAction stored asset " + targetasset); 163 | } catch (final Throwable e) { 164 | Logger.warn(this.getClass(), "Loader.processAction asset " + targetasset + " could not be stored, carrying the asset within the next action", e); 165 | storeToMessage = true; 166 | } 167 | } 168 | if (storeToMessage) { 169 | final JSONArray actions = action.getEmbeddedActions(); 170 | actions.forEach(a -> 171 | new SusiAction((JSONObject) a).setBinaryAsset(targetasset, b) 172 | ); 173 | Logger.info(this.getClass(), "Loader.processAction stored asset " + targetasset + " into message"); 174 | } 175 | Logger.info(this.getClass(), "Loader.processAction processed message from queue and stored asset " + targetasset); 176 | 177 | // success (has done something) 178 | return actionResult; 179 | } 180 | 181 | // fail (nothing done) 182 | return ActionResult.FAIL_IRREVERSIBLE; 183 | } 184 | 185 | @Override 186 | public Telemetry getTelemetry() { 187 | return null; 188 | } 189 | } -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Use "xargs" to parse quoted args. 209 | # 210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 211 | # 212 | # In Bash we could simply go: 213 | # 214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 215 | # set -- "${ARGS[@]}" "$@" 216 | # 217 | # but POSIX shell has neither arrays nor command substitution, so instead we 218 | # post-process each arg (as a line of input to sed) to backslash-escape any 219 | # character that might be a shell metacharacter, then use eval to reverse 220 | # that process (while maintaining the separation between arguments), and wrap 221 | # the whole thing up as a single "set" statement. 222 | # 223 | # This will of course break if any of these variables contains a newline or 224 | # an unmatched quote. 225 | # 226 | 227 | eval "set -- $( 228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 229 | xargs -n1 | 230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 231 | tr '\n' ' ' 232 | )" '"$@"' 233 | 234 | exec "$JAVACMD" "$@" 235 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/LoaderClientConnection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ApacheHttpClient 3 | * Copyright 24.2.2018 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.retrieval; 21 | 22 | import java.io.BufferedInputStream; 23 | import java.io.ByteArrayOutputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.net.SocketTimeoutException; 27 | import java.net.UnknownHostException; 28 | import java.nio.charset.StandardCharsets; 29 | import java.util.ArrayList; 30 | import java.util.HashMap; 31 | import java.util.List; 32 | import java.util.Map; 33 | import java.util.concurrent.Executors; 34 | import java.util.concurrent.ScheduledExecutorService; 35 | import java.util.concurrent.TimeUnit; 36 | 37 | import javax.net.ssl.SSLHandshakeException; 38 | 39 | import org.apache.http.Header; 40 | import org.apache.http.HttpEntity; 41 | import org.apache.http.HttpResponse; 42 | import org.apache.http.RequestLine; 43 | import org.apache.http.client.config.RequestConfig; 44 | import org.apache.http.client.methods.HttpGet; 45 | import org.apache.http.client.methods.HttpHead; 46 | import org.apache.http.client.methods.HttpRequestBase; 47 | import org.apache.http.conn.HttpHostConnectException; 48 | import org.apache.http.impl.client.CloseableHttpClient; 49 | import org.apache.http.impl.client.HttpClientBuilder; 50 | import org.apache.http.util.EntityUtils; 51 | 52 | import net.yacy.grid.http.ClientConnection; 53 | import net.yacy.grid.http.ClientIdentification; 54 | import net.yacy.grid.tools.Logger; 55 | 56 | public class LoaderClientConnection implements HttpClient { 57 | 58 | private static final String CRLF = new String(ClientConnection.CRLF, StandardCharsets.US_ASCII); 59 | 60 | public static String userAgent = ClientIdentification.browserAgent.userAgent; 61 | private static CloseableHttpClient httpClient = ClientConnection.getClosableHttpClient(userAgent); 62 | private static ScheduledExecutorService executorService = Executors.newScheduledThreadPool(30); 63 | 64 | static { 65 | RequestConfig config = RequestConfig.custom() 66 | .setConnectTimeout(10000) 67 | .setConnectionRequestTimeout(10000) 68 | .setSocketTimeout(10000).build(); 69 | httpClient = 70 | HttpClientBuilder.create().setDefaultRequestConfig(config).build(); 71 | } 72 | 73 | private int status_code; 74 | private String mime; 75 | private final Map> header; 76 | private final String requestHeader; 77 | 78 | private String responseHeader; 79 | private byte[] content; 80 | 81 | public LoaderClientConnection(final String url, final boolean head) throws IOException { 82 | this.status_code = -1; 83 | this.content = null; 84 | this.mime = ""; 85 | this.header = new HashMap>(); 86 | 87 | final HttpRequestBase request = head ? new HttpHead(url) : new HttpGet(url); 88 | request.setHeader("User-Agent", userAgent); 89 | request.setHeader("Accept", "text/html, image/gif, image/jpeg, *; q=.2, */*; q=.2"); 90 | 91 | // compute the request header (we do this to have a documentation later of what we did) 92 | final StringBuffer sb = new StringBuffer(); 93 | final RequestLine status = request.getRequestLine(); 94 | sb.append(status.toString()).append(CRLF); 95 | for (final Header h: request.getAllHeaders()) { 96 | sb.append(h.getName()).append(": ").append(h.getValue()).append(CRLF); 97 | } 98 | sb.append(CRLF); 99 | this.requestHeader = sb.toString(); 100 | 101 | // do the request 102 | HttpResponse httpResponse = null; 103 | try { 104 | executorService.schedule(request::abort, (long)10, TimeUnit.SECONDS); 105 | httpResponse = httpClient.execute(request); 106 | } catch (final UnknownHostException e) { 107 | request.releaseConnection(); 108 | throw new IOException("client connection failed: unknown host " + request.getURI().getHost()); 109 | } catch (final SocketTimeoutException e) { 110 | request.releaseConnection(); 111 | throw new IOException("client connection timeout for request: " + request.getURI()); 112 | } catch (final SSLHandshakeException e) { 113 | request.releaseConnection(); 114 | throw new IOException("client connection handshake error for domain " + request.getURI().getHost() + ": " + e.getMessage()); 115 | } catch (final HttpHostConnectException e) { 116 | request.releaseConnection(); 117 | throw new IOException("client connection refused for request " + request.getURI() + ": " + e.getMessage()); 118 | } catch (final Throwable e) { 119 | request.releaseConnection(); 120 | throw new IOException("error " + request.getURI() + ": " + e.getMessage()); 121 | } finally { 122 | if (httpResponse != null) { 123 | this.status_code = httpResponse.getStatusLine().getStatusCode(); 124 | final HttpEntity httpEntity = httpResponse.getEntity(); 125 | if (head || this.status_code != 200) { 126 | EntityUtils.consumeQuietly(httpEntity); 127 | if (!head && this.status_code != 200) { 128 | request.releaseConnection(); 129 | throw new IOException("client connection to " + url + " fail (status code " + this.status_code + "): " + httpResponse.getStatusLine().getReasonPhrase()); 130 | } 131 | } else { 132 | try { 133 | final InputStream inputStream = new BufferedInputStream(httpEntity.getContent()); 134 | final ByteArrayOutputStream r = new ByteArrayOutputStream(); 135 | final byte[] b = new byte[1024]; 136 | int c; 137 | while ((c = inputStream.read(b)) > 0) r.write(b, 0, c); 138 | this.content = r.toByteArray(); 139 | } catch (final IOException e) { 140 | throw e; 141 | } 142 | Logger.info(this.getClass(), "ContentLoader loaded " + url); 143 | } 144 | 145 | // read response header and set mime 146 | if (this.status_code == 200 || this.status_code == 403) { 147 | for (final Header h: httpResponse.getAllHeaders()) { 148 | List vals = this.header.get(h.getName()); 149 | if (vals == null) { vals = new ArrayList(); this.header.put(h.getName(), vals); } 150 | vals.add(h.getValue()); 151 | if (h.getName().equals("Content-Type")) this.mime = h.getValue(); 152 | } 153 | } 154 | 155 | // fix mime in case a font is assigned 156 | final int p = this.mime.indexOf(';'); 157 | if (p >= 0) { 158 | String charset = p < this.mime.length() - 2 ? this.mime.substring(p + 2) : ""; 159 | this.mime = this.mime.substring(0, p); 160 | if (charset.startsWith("; charset=")) charset = charset.substring(10); 161 | } 162 | 163 | // compute response header string 164 | sb.setLength(0); 165 | sb.append(status.getProtocolVersion()).append(' ').append(this.status_code).append(CRLF); 166 | for (final Map.Entry> headers: this.header.entrySet()) { 167 | for (final String v: headers.getValue()) { 168 | sb.append(headers.getKey()).append(": ").append(v).append(CRLF); 169 | } 170 | } 171 | sb.append(CRLF); 172 | this.responseHeader = sb.toString(); 173 | } 174 | request.releaseConnection(); 175 | } 176 | } 177 | 178 | @Override 179 | public int getStatusCode() { 180 | return this.status_code; 181 | } 182 | 183 | @Override 184 | public String getMime() { 185 | return this.mime; 186 | } 187 | 188 | @Override 189 | public Map> getHeader() { 190 | return this.header; 191 | } 192 | 193 | @Override 194 | public String getRequestHeader() { 195 | return this.requestHeader; 196 | } 197 | 198 | @Override 199 | public String getResponseHeader() { 200 | return this.responseHeader; 201 | } 202 | 203 | @Override 204 | public byte[] getContent() { 205 | return this.content; 206 | } 207 | 208 | public static void main(final String[] args) { 209 | try { 210 | //final LoaderClientConnection client = new LoaderClientConnection("https://yacy.net", false); 211 | final LoaderClientConnection client = new LoaderClientConnection("https://morrismuseum.org/", false); 212 | 213 | final int status = client.getStatusCode(); 214 | System.out.println("status: " + status); 215 | //String requestHeaders = client.getRequestHeader().toString(); 216 | //String responseHeaders = client.getResponseHeader().toString(); 217 | System.out.println(new String(client.getContent())); 218 | 219 | } catch (final IOException e) { 220 | e.printStackTrace(); 221 | } 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/HtmlUnitLoader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * HtmlUnitLoader 3 | * Copyright 25.4.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.retrieval; 21 | 22 | import java.io.IOException; 23 | import java.net.MalformedURLException; 24 | import java.net.URL; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.TimeZone; 28 | 29 | import com.gargoylesoftware.css.parser.CSSErrorHandler; 30 | import com.gargoylesoftware.css.parser.CSSException; 31 | import com.gargoylesoftware.css.parser.CSSParseException; 32 | import com.gargoylesoftware.htmlunit.BrowserVersion; 33 | import com.gargoylesoftware.htmlunit.HttpMethod; 34 | import com.gargoylesoftware.htmlunit.BrowserVersion.BrowserVersionBuilder; 35 | import com.gargoylesoftware.htmlunit.IncorrectnessListener; 36 | import com.gargoylesoftware.htmlunit.Page; 37 | import com.gargoylesoftware.htmlunit.ScriptException; 38 | import com.gargoylesoftware.htmlunit.TopLevelWindow; 39 | import com.gargoylesoftware.htmlunit.WebClient; 40 | import com.gargoylesoftware.htmlunit.WebClientOptions; 41 | import com.gargoylesoftware.htmlunit.WebRequest; 42 | import com.gargoylesoftware.htmlunit.WebWindow; 43 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 44 | import com.gargoylesoftware.htmlunit.html.parser.HTMLParserListener; 45 | import com.gargoylesoftware.htmlunit.javascript.JavaScriptErrorListener; 46 | import com.gargoylesoftware.htmlunit.util.NameValuePair; 47 | import com.gargoylesoftware.htmlunit.util.UrlUtils; 48 | 49 | import net.yacy.grid.tools.Logger; 50 | import net.yacy.grid.tools.Memory; 51 | 52 | /** 53 | * http://htmlunit.sourceforge.net/ 54 | */ 55 | public class HtmlUnitLoader { 56 | 57 | public static WebClient getClient() { 58 | return getClient(BrowserVersion.CHROME.getUserAgent()); 59 | } 60 | 61 | public static WebClient getClient(String userAgent) { 62 | WebClient webClient = new WebClient(getBrowser(userAgent)); 63 | WebClientOptions options = webClient.getOptions(); 64 | options.setJavaScriptEnabled(true); 65 | options.setCssEnabled(false); 66 | options.setPopupBlockerEnabled(true); 67 | options.setRedirectEnabled(true); 68 | options.setDownloadImages(false); 69 | options.setGeolocationEnabled(false); 70 | options.setPrintContentOnFailingStatusCode(false); 71 | options.setThrowExceptionOnScriptError(false); 72 | options.setMaxInMemory(0); 73 | options.setHistoryPageCacheLimit(0); 74 | options.setHistorySizeLimit(0); 75 | //ProxyConfig proxyConfig = new ProxyConfig(); 76 | //proxyConfig.setProxyHost("127.0.0.1"); 77 | //proxyConfig.setProxyPort(Service.getPort()); 78 | //options.setProxyConfig(proxyConfig); 79 | webClient.getCache().setMaxSize(10000); // this might be a bit large, is regulated with throttling and client cache clear in short memory status 80 | webClient.setIncorrectnessListener(new IncorrectnessListener() { 81 | @Override 82 | public void notify(String arg0, Object arg1) {} 83 | }); 84 | webClient.setCssErrorHandler(new CSSErrorHandler() { 85 | @Override 86 | public void warning(CSSParseException exception) throws CSSException {} 87 | @Override 88 | public void error(CSSParseException exception) throws CSSException {} 89 | @Override 90 | public void fatalError(CSSParseException exception) throws CSSException {} 91 | }); 92 | webClient.setJavaScriptErrorListener(new JavaScriptErrorListener() { 93 | @Override 94 | public void timeoutError(HtmlPage arg0, long arg1, long arg2) {} 95 | @Override 96 | public void scriptException(HtmlPage arg0, ScriptException arg1) {} 97 | @Override 98 | public void malformedScriptURL(HtmlPage arg0, String arg1, MalformedURLException arg2) {} 99 | @Override 100 | public void loadScriptError(HtmlPage arg0, URL arg1, Exception arg2) {} 101 | @Override 102 | public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {} 103 | }); 104 | webClient.setHTMLParserListener(new HTMLParserListener() { 105 | @Override 106 | public void error(String message, URL url, String html, int line, int column, String key) {} 107 | @Override 108 | public void warning(String message, URL url, String html, int line, int column, String key) {} 109 | }); 110 | return webClient; 111 | } 112 | 113 | 114 | private static BrowserVersion getBrowser(String userAgent) { 115 | BrowserVersionBuilder browserBuilder = getBrowserBuilder(); 116 | browserBuilder.setUserAgent(userAgent); 117 | return browserBuilder.build(); 118 | } 119 | 120 | private static BrowserVersionBuilder getBrowserBuilder() { 121 | BrowserVersionBuilder browserBuilder = new BrowserVersion.BrowserVersionBuilder(BrowserVersion.CHROME); 122 | browserBuilder.setSystemTimezone(TimeZone.getDefault()); 123 | return browserBuilder; 124 | } 125 | 126 | private String url, xml, responseHeaders, requestHeaders; 127 | 128 | public String getUrl() { 129 | return this.url; 130 | } 131 | 132 | public String getXml() { 133 | return this.xml; 134 | } 135 | 136 | public String getResponseHeaders() { 137 | return this.responseHeaders; 138 | } 139 | 140 | public String getRequestHeaders() { 141 | return this.requestHeaders; 142 | } 143 | 144 | private String parseRequestHeaders(HttpMethod httpMethod, String url, Map headers) { 145 | String header = String.format("%s %s HTTP/1.1", httpMethod.toString(), url); 146 | 147 | for (Map.Entry entry : headers.entrySet()) { 148 | header = String.format( 149 | "%s\n%s: %s", 150 | header, 151 | entry.getKey(), 152 | entry.getValue() 153 | ); 154 | } 155 | 156 | return String.format("%s\n\n", header); 157 | } 158 | 159 | private String parseResponseHeaders(int statusCode, List headers) { 160 | String header = String.format("HTTP/1.1 %d", statusCode); 161 | 162 | for (NameValuePair nameValuePair : headers) { 163 | header = String.format( 164 | "%s\n%s: %s", 165 | header, 166 | nameValuePair.getName(), 167 | nameValuePair.getValue() 168 | ); 169 | } 170 | 171 | return String.format("%s\n\n", header); 172 | } 173 | 174 | public HtmlUnitLoader(String url, String windowName) throws IOException {// check short memory status 175 | 176 | this.url = url; 177 | HtmlPage page; 178 | try (WebClient client = getClient()) { 179 | long mem0 = Memory.available(); 180 | URL uurl = UrlUtils.toUrlUnsafe(url); 181 | String htmlAcceptHeader = client.getBrowserVersion().getHtmlAcceptHeader(); 182 | WebWindow webWindow = client.openWindow(uurl, windowName); // throws ClassCastException: com.gargoylesoftware.htmlunit.UnexpectedPage cannot be cast to com.gargoylesoftware.htmlunit.html.HtmlPage 183 | WebRequest webRequest = new WebRequest(uurl, htmlAcceptHeader, null); 184 | page = client.getPage(webWindow, webRequest); // com.gargoylesoftware.htmlunit.xml.XmlPage cannot be cast to com.gargoylesoftware.htmlunit.html.HtmlPage 185 | this.xml = page.asXml(); 186 | 187 | this.requestHeaders = this.parseRequestHeaders( 188 | webRequest.getHttpMethod(), 189 | url, 190 | webRequest.getAdditionalHeaders() 191 | ); 192 | 193 | this.responseHeaders = this.parseResponseHeaders( 194 | page.getWebResponse().getStatusCode(), 195 | page.getWebResponse().getResponseHeaders() 196 | ); 197 | 198 | long mem1 = Memory.available(); 199 | Page htmlpage = webWindow.getEnclosedPage(); 200 | htmlpage.cleanUp(); 201 | if (webWindow instanceof TopLevelWindow) ((TopLevelWindow) webWindow).close(); 202 | for (WebWindow ww: client.getWebWindows()) { 203 | if (ww instanceof TopLevelWindow) ((TopLevelWindow) ww).close(); 204 | ww.getJobManager().removeAllJobs(); 205 | } 206 | client.deregisterWebWindow(webWindow); 207 | client.getCache().clear(); 208 | client.close(); 209 | long mem2 = Memory.available(); 210 | Logger.info(this.getClass(), "HtmlUnitLoader loaded " + url + " - " + this.xml.length() + " bytes; used " + (mem1 - mem0) + " bytes, after cleanup " + (mem2 - mem0) + " bytes"); 211 | } catch (Throwable e) { 212 | // there can be many reasons here, i.e. an error in javascript 213 | // we should always treat this as if the error is within the HTMLUnit, not the web page. 214 | // Therefore, we should do a fail-over without HTMLUnit 215 | // Data.logger.warn("HtmlUnitLoader Error loading " + url, e); 216 | // load the page with standard client anyway 217 | // to do this, we throw an IOException here and the caller must handle this 218 | throw new IOException(e.getMessage()); 219 | } 220 | } 221 | 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/ContentLoader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ContentLoader 3 | * Copyright 11.5.2017 by Michael Peter Christen, @0rb1t3r 4 | * 5 | * This library is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU Lesser General Public 7 | * License as published by the Free Software Foundation; either 8 | * version 2.1 of the License, or (at your option) any later version. 9 | * 10 | * This library is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * Lesser General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License 16 | * along with this program in the file lgpl21.txt 17 | * If not, see . 18 | */ 19 | 20 | package net.yacy.grid.loader.retrieval; 21 | 22 | import java.io.ByteArrayOutputStream; 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.io.OutputStream; 26 | import java.nio.charset.StandardCharsets; 27 | import java.text.SimpleDateFormat; 28 | import java.util.ArrayList; 29 | import java.util.Date; 30 | import java.util.HashMap; 31 | import java.util.LinkedHashMap; 32 | import java.util.List; 33 | import java.util.Locale; 34 | import java.util.Map; 35 | import java.util.concurrent.atomic.AtomicLong; 36 | import java.util.regex.Matcher; 37 | import java.util.regex.Pattern; 38 | 39 | import org.json.JSONArray; 40 | import org.jwat.warc.WarcWriter; 41 | import org.jwat.warc.WarcWriterFactory; 42 | 43 | import ai.susi.mind.SusiAction; 44 | import ai.susi.mind.SusiAction.RenderType; 45 | import net.yacy.grid.io.index.CrawlerDocument; 46 | import net.yacy.grid.io.index.CrawlerDocument.Status; 47 | import net.yacy.grid.loader.JwatWarcWriter; 48 | import net.yacy.grid.mcp.BrokerListener.ActionResult; 49 | import net.yacy.grid.mcp.Service; 50 | import net.yacy.grid.tools.Classification; 51 | import net.yacy.grid.tools.Digest; 52 | import net.yacy.grid.tools.Logger; 53 | import net.yacy.grid.tools.MultiProtocolURL; 54 | 55 | public class ContentLoader { 56 | 57 | private final static Pattern charsetPattern = Pattern.compile("charset=([^\\s]+)"); 58 | 59 | private byte[] content; 60 | private ActionResult result; 61 | 62 | public ContentLoader( 63 | final SusiAction action, final JSONArray data, final boolean compressed, final String threadnameprefix, 64 | final String id, final int depth, final int crawlingDepth, final boolean loaderHeadless, final int priority) { 65 | this.content = new byte[0]; 66 | this.result = ActionResult.FAIL_IRREVERSIBLE; 67 | 68 | // this must have a loader action 69 | if (action.getRenderType() != RenderType.loader) { 70 | return; 71 | } 72 | 73 | // extract urls 74 | final JSONArray urls = action.getArrayAttr("urls"); 75 | final List urlss = new ArrayList<>(); 76 | urls.forEach(u -> urlss.add(((String) u))); 77 | final byte[] warcPayload = data.toString(2).getBytes(StandardCharsets.UTF_8); 78 | 79 | // start loading 80 | Thread.currentThread().setName(threadnameprefix + " loading " + urlss.toString()); 81 | 82 | // construct a WARC 83 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 84 | try { 85 | final WarcWriter ww = ContentLoader.initWriter(out, warcPayload, compressed); 86 | final Map errors = ContentLoader.load(ww, urlss, threadnameprefix, id, depth, crawlingDepth, loaderHeadless, priority); 87 | this.result = ActionResult.SUCCESS; 88 | errors.forEach((u, c) -> { 89 | Logger.debug(this.getClass(), "Loader - cannot load: " + u + " - " + c); 90 | if (c == ActionResult.FAIL_RETRY && this.result == ActionResult.SUCCESS) this.result = ActionResult.FAIL_RETRY; 91 | if (c == ActionResult.FAIL_IRREVERSIBLE) this.result = ActionResult.FAIL_IRREVERSIBLE; 92 | }); 93 | } catch (final IOException e) { 94 | Logger.warn(this.getClass(), "ContentLoader WARC writer init problem", e); 95 | } finally { 96 | if (out != null) try {out.close();} catch (final IOException e) {} 97 | } 98 | this.content = ((ByteArrayOutputStream) out).toByteArray(); 99 | this.result = ActionResult.SUCCESS; 100 | } 101 | 102 | 103 | public byte[] getContent() { 104 | return this.content; 105 | } 106 | 107 | public ActionResult getResult() { 108 | return this.result; 109 | } 110 | 111 | private final static SimpleDateFormat millisFormat = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US); 112 | private final static AtomicLong createTempFileCounter = new AtomicLong(0); 113 | public static File createTempFile(final String prefix, final String suffix) throws IOException { 114 | final String tmpprefix = prefix + "-" + millisFormat.format(new Date()) + Long.toString(createTempFileCounter.getAndIncrement()); 115 | final File tmp = File.createTempFile(tmpprefix, suffix); 116 | return tmp; 117 | } 118 | 119 | private static WarcWriter initWriter(final OutputStream out, final byte[] payload, final boolean compressed) throws IOException { 120 | final WarcWriter ww = WarcWriterFactory.getWriter(out, compressed); 121 | JwatWarcWriter.writeWarcinfo(ww, new Date(), null, null, payload); 122 | return ww; 123 | } 124 | 125 | private static Map load( 126 | final WarcWriter warcWriter, final List urls, final String threadName, 127 | final String id, final int depth, final int crawlingDepth, final boolean loaderHeadless, final int priority) throws IOException { 128 | 129 | // this is here for historical reasons, we actually should have all urls normalized 130 | final List fixedURLs = new ArrayList<>(); 131 | urls.forEach(url -> { 132 | if (url.indexOf("//") < 0) url = "http://" + url; 133 | fixedURLs.add(url); 134 | }); 135 | 136 | // prepare map with ids and load crawlerDocuments 137 | final Map urlmap = new HashMap<>(); 138 | fixedURLs.forEach(url -> urlmap.put(url, Digest.encodeMD5Hex(url))); 139 | final Map crawlerDocuments = CrawlerDocument.loadBulk(Service.instance.config, Service.instance.config.gridIndex, urlmap.values()); 140 | 141 | // load content 142 | final Map errors = new LinkedHashMap<>(); 143 | fixedURLs.forEach(url -> { 144 | 145 | // do loader throttling here 146 | long throttling = 250; 147 | try { 148 | throttling = Service.instance.config.gridControl.checkThrottling(id, url, depth, crawlingDepth, loaderHeadless, priority); 149 | } catch (final IOException e1) {} 150 | Thread.currentThread().setName(threadName + " loading " + url.toString() + ", throttling = " + throttling); 151 | try {Thread.sleep(throttling);} catch (final InterruptedException e) {} 152 | 153 | // start loading 154 | try { 155 | // load entry from crawler index 156 | final String urlid = urlmap.get(url); 157 | final CrawlerDocument crawlerDocument = crawlerDocuments.get(urlid); 158 | //assert crawlerDocument != null; 159 | 160 | // load content from the network 161 | final long t = System.currentTimeMillis(); 162 | try { 163 | boolean success = false; 164 | if (url.startsWith("http")) success = loadHTTP(warcWriter, url, threadName, loaderHeadless); 165 | else if (url.startsWith("ftp")) loadFTP(warcWriter, url); 166 | else if (url.startsWith("smb")) loadSMB(warcWriter, url); 167 | 168 | // write success status 169 | if (success && crawlerDocument != null) { 170 | final long load_time = System.currentTimeMillis() - t; 171 | crawlerDocument.setStatus(Status.loaded).setStatusDate(new Date()).setComment("load time: " + load_time + " milliseconds"); 172 | // crawlerDocument.store(Data.gridIndex); we bulk-store this later 173 | // check with http://localhost:9200/crawler/_search?q=status_s:loaded 174 | } 175 | } catch (final IOException e) { 176 | // write fail status 177 | if (crawlerDocument != null) { 178 | final long load_time = System.currentTimeMillis() - t; 179 | crawlerDocument.setStatus(Status.load_failed).setStatusDate(new Date()).setComment("load fail: '" + e.getMessage() + "' after " + load_time + " milliseconds"); 180 | // crawlerDocument.store(Data.gridIndex); we bulk-store this later 181 | // check with http://localhost:9200/crawler/_search?q=status_s:load_failed 182 | } 183 | } 184 | } catch (final Throwable e) { 185 | Logger.warn("ContentLoader cannot load " + url + " - " + e.getMessage()); 186 | errors.put(url, ActionResult.FAIL_IRREVERSIBLE); 187 | } 188 | }); 189 | 190 | // bulk-store the crawler documents 191 | try { 192 | CrawlerDocument.storeBulk(Service.instance.config, Service.instance.config.gridIndex, crawlerDocuments); 193 | } catch (final Throwable e) { 194 | Logger.error(e); 195 | } 196 | return errors; 197 | } 198 | 199 | private static void loadFTP(final WarcWriter warcWriter, final String url) throws IOException { 200 | 201 | } 202 | 203 | private static void loadSMB(final WarcWriter warcWriter, final String url) throws IOException { 204 | 205 | } 206 | 207 | private static boolean loadHTTP(final WarcWriter warcWriter, final String url, final String threadName, final boolean useHeadlessLoader) throws IOException {// check short memory status 208 | final Date loaddate = new Date(); 209 | byte[] content = null; 210 | String requestHeaders = null; 211 | String responseHeaders = null; 212 | final MultiProtocolURL u = new MultiProtocolURL(url); 213 | 214 | if (useHeadlessLoader) { 215 | // using the headless loader only makes sense in certain situations: 216 | // we must make sure that the content is actually html, othwewise there is 217 | // no point in usage of the headless loader and we would fall back to normal loading. 218 | String ext = MultiProtocolURL.getFileExtension(u.getFileName()); 219 | boolean isHtml = Classification.isHtmlExtension(ext); 220 | 221 | // not all content that is actually html requires an text extension, we also check the mime type by using a head request 222 | if (!isHtml) { 223 | LoaderClientConnection ac = new LoaderClientConnection(url, true); 224 | String mime = ac.getMime(); 225 | isHtml = mime.endsWith("/html") || mime.endsWith("/xhtml+xml"); 226 | } 227 | 228 | // finally we use the headless loader to get the content 229 | if (isHtml) try { 230 | // use htmlunit to load this 231 | final HtmlUnitLoader htmlUnitLoader = new HtmlUnitLoader(url, threadName); 232 | final String xml = htmlUnitLoader.getXml(); 233 | 234 | requestHeaders = htmlUnitLoader.getRequestHeaders(); 235 | responseHeaders = htmlUnitLoader.getResponseHeaders(); 236 | 237 | // we consider that the resulting charset should be UTF_8 238 | content = xml.getBytes(StandardCharsets.UTF_8); 239 | 240 | // However, the original Content-Type may denote a different charset 241 | // Therefore we must patch that charset now in the response header 242 | Matcher matcher = charsetPattern.matcher(responseHeaders); 243 | if (matcher.find()) { 244 | String oldCharset = matcher.group(1); 245 | String newCharset = StandardCharsets.UTF_8.name(); 246 | if (!oldCharset.equals(newCharset)) { 247 | StringBuffer sb = new StringBuffer(); 248 | matcher.appendReplacement(sb, "charset=" + newCharset); 249 | matcher.appendTail(sb); 250 | responseHeaders = sb.toString(); 251 | } 252 | } 253 | } catch (final Throwable e) { 254 | // do nothing here, input stream is not set 255 | final String cause = e == null ? "null" : e.getMessage(); 256 | if (cause != null && cause.indexOf("404") >= 0) { 257 | throw new IOException("" + url + " fail: " + cause); 258 | } 259 | Logger.debug("Loader - HtmlUnit failed (will retry): " + cause); 260 | } 261 | } 262 | 263 | // Here we may not have loaded the content because of not-required headless loading or 264 | // because headless loading has failed. Do a normal loading: 265 | if (content == null) { 266 | // do another http request. This can either happen because mime type is not html 267 | // or it was html and HtmlUnit has failed - we retry the normal way here. 268 | 269 | LoaderClientConnection ac = new LoaderClientConnection(url, false); 270 | final int status = ac.getStatusCode(); 271 | if (status != 200) return false; 272 | 273 | requestHeaders = ac.getRequestHeader(); 274 | responseHeaders = ac.getResponseHeader(); 275 | 276 | content = ac.getContent(); 277 | } 278 | 279 | if (content == null || content.length == 0) return false; 280 | 281 | JwatWarcWriter.writeRequest(warcWriter, url, null, loaddate, null, null, requestHeaders.getBytes(StandardCharsets.UTF_8)); 282 | 283 | // add the request header before the content 284 | final ByteArrayOutputStream r = new ByteArrayOutputStream(); 285 | r.write(responseHeaders.getBytes(StandardCharsets.UTF_8)); 286 | r.write(content); 287 | content = r.toByteArray(); 288 | 289 | Logger.info("ContentLoader writing WARC for " + url + " - " + content.length + " bytes"); 290 | JwatWarcWriter.writeResponse(warcWriter, url, null, loaddate, null, null, content); 291 | 292 | return true; 293 | } 294 | 295 | private static String getTestWarcContent(String url, boolean loaderHeadless) { 296 | final byte[] warcPayload = "test".getBytes(StandardCharsets.UTF_8); 297 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 298 | try { 299 | WarcWriter warcWriter = ContentLoader.initWriter(out, warcPayload, false); 300 | loadHTTP(warcWriter, url, "test", loaderHeadless); 301 | warcWriter.close(); 302 | out.close(); 303 | String b = new String(out.toByteArray(), StandardCharsets.UTF_8); 304 | return b; 305 | } catch (IOException e) { 306 | e.printStackTrace(); 307 | } 308 | return ""; 309 | } 310 | 311 | public static void main(String[] args) { 312 | String url = "https://www.schulministerium.nrw.de/BiPo/SchuleAendern/msbleikaleistungen.html?katalogId=99088003034004"; 313 | 314 | String headless = getTestWarcContent(url, true); 315 | String normal = getTestWarcContent(url, false); 316 | System.out.println("headless:\n" + headless); 317 | System.out.println("\nnormal:\n" + normal); 318 | //System.out.println("Difference: " + StringUtils.difference(headless, normal)); // requires import org.apache.commons.lang3.StringUtils; 319 | System.exit(0); 320 | } 321 | 322 | } 323 | -------------------------------------------------------------------------------- /src/main/java/net/yacy/grid/loader/retrieval/FTPClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * FTPClient 3 | * Copyright 2002, 2004, 2006, 2010 by Michael Peter Christen 4 | * first published on http://yacy.net 5 | * main implementation finished: 28.05.2002 6 | * last major change: 06.05.2004 7 | * added html generation for directories: 5.9.2006 8 | * migrated to the cora package and re-licensed under lgpl: 23.08.2010 9 | * 10 | * This file is part of YaCy Content Integration 11 | * 12 | * This library is free software; you can redistribute it and/or 13 | * modify it under the terms of the GNU Lesser General Public 14 | * License as published by the Free Software Foundation; either 15 | * version 2.1 of the License, or (at your option) any later version. 16 | * 17 | * This library is distributed in the hope that it will be useful, 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | * Lesser General Public License for more details. 21 | * 22 | * You should have received a copy of the GNU Lesser General Public License 23 | * along with this program in the file lgpl21.txt 24 | * If not, see . 25 | */ 26 | 27 | package net.yacy.grid.loader.retrieval; 28 | 29 | import java.io.BufferedOutputStream; 30 | import java.io.BufferedReader; 31 | import java.io.ByteArrayOutputStream; 32 | import java.io.DataInputStream; 33 | import java.io.DataOutputStream; 34 | import java.io.File; 35 | import java.io.FileInputStream; 36 | import java.io.FileNotFoundException; 37 | import java.io.FileOutputStream; 38 | import java.io.IOException; 39 | import java.io.InputStream; 40 | import java.io.InputStreamReader; 41 | import java.io.OutputStream; 42 | import java.io.PrintStream; 43 | import java.io.RandomAccessFile; 44 | import java.lang.reflect.Array; 45 | import java.lang.reflect.InvocationTargetException; 46 | import java.lang.reflect.Method; 47 | import java.net.InetAddress; 48 | import java.net.InetSocketAddress; 49 | import java.net.ServerSocket; 50 | import java.net.Socket; 51 | import java.net.SocketException; 52 | import java.nio.charset.StandardCharsets; 53 | import java.text.DateFormat; 54 | import java.text.ParseException; 55 | import java.text.SimpleDateFormat; 56 | import java.util.ArrayList; 57 | import java.util.Calendar; 58 | import java.util.Date; 59 | import java.util.HashMap; 60 | import java.util.List; 61 | import java.util.Locale; 62 | import java.util.Map; 63 | import java.util.Properties; 64 | import java.util.StringTokenizer; 65 | import java.util.concurrent.BlockingQueue; 66 | import java.util.concurrent.LinkedBlockingQueue; 67 | import java.util.regex.Matcher; 68 | import java.util.regex.Pattern; 69 | 70 | import net.yacy.grid.tools.Domains; 71 | import net.yacy.grid.tools.Logger; 72 | 73 | public class FTPClient { 74 | 75 | public static final String ANONYMOUS = "anonymous"; 76 | 77 | private static final String vDATE = "20161222"; 78 | 79 | private boolean glob = true; // glob = false -> filenames are taken 80 | // literally for mget, .. 81 | 82 | // transfer type 83 | private static final char transferType = 'i'; // transfer binary 84 | 85 | // block size [1K by default] 86 | private static final int blockSize = 1024; 87 | 88 | // client socket for commands 89 | private Socket ControlSocket = null; 90 | 91 | // socket timeout 92 | private static final int ControlSocketTimeout = 10000; 93 | 94 | // data socket timeout 95 | private int DataSocketTimeout = 0; // in seconds (default infinite) 96 | 97 | // socket for data transactions 98 | private ServerSocket DataSocketActive = null; 99 | private Socket DataSocketPassive = null; 100 | private boolean DataSocketPassiveMode = true; 101 | 102 | // output and input streams for client control connection 103 | private BufferedReader clientInput = null; 104 | private DataOutputStream clientOutput = null; 105 | 106 | // client prompt 107 | private String prompt = "ftp [local]>"; 108 | 109 | String[] cmd; 110 | 111 | // session parameters 112 | File currentLocalPath; 113 | String account, password, host, remotemessage, remotegreeting, remotesystem; 114 | int port; 115 | 116 | // entry info cache 117 | private final Map infoCache = new HashMap(); 118 | 119 | // date-format in LIST (english month names) 120 | private static final SimpleDateFormat lsDateFormat = new SimpleDateFormat("MMM d y H:m", new Locale("en")); 121 | 122 | // TODO: implement RFC 2640 Internationalization 123 | 124 | public FTPClient() { 125 | 126 | this.currentLocalPath = new File(System.getProperty("user.dir")); 127 | try { 128 | this.currentLocalPath = new File(this.currentLocalPath.getCanonicalPath()); 129 | } catch (final IOException e) { 130 | } 131 | 132 | this.account = null; 133 | this.password = null; 134 | this.host = null; 135 | this.port = -1; 136 | this.remotemessage = null; 137 | this.remotegreeting = null; 138 | this.remotesystem = null; 139 | } 140 | 141 | public boolean exec(String command, final boolean promptIt) { 142 | if ((command == null) || (command.isEmpty())) { 143 | return true; 144 | } 145 | int pos; 146 | String com; 147 | boolean ret = true; 148 | while (command.length() > 0) { 149 | pos = command.indexOf(';',0); 150 | if (pos < 0) { 151 | pos = command.indexOf("\n",0); 152 | } 153 | if (pos < 0) { 154 | com = command; 155 | command = ""; 156 | } else { 157 | com = command.substring(0, pos); 158 | command = command.substring(pos + 1); 159 | } 160 | if (promptIt) { 161 | Logger.info(this.prompt + com); 162 | } 163 | this.cmd = line2args(com); 164 | try { 165 | ret = (((Boolean) getClass().getMethod(this.cmd[0].toUpperCase(), (Class[]) Array.newInstance(Class.class, 0)).invoke(this, (Object[]) Array.newInstance(Object.class, 0))) 166 | .booleanValue()); 167 | } catch (final InvocationTargetException e) { 168 | if (e.getMessage() != null) { 169 | if (notConnected()) { 170 | // the error was probably caused because there is no 171 | // connection 172 | Logger.warn("not connected. no effect.", e); 173 | } else { 174 | Logger.warn("ftp internal exception: target exception " + e); 175 | } 176 | return ret; 177 | } 178 | } catch (final IllegalAccessException e) { 179 | Logger.warn("ftp internal exception: wrong access " + e); 180 | return ret; 181 | } catch (final NoSuchMethodException e) { 182 | // consider first that the user attempted to execute a java 183 | // command from 184 | // the current path; either local or remote 185 | if (notConnected()) { 186 | // try a local exec 187 | try { 188 | javaexec(this.cmd); 189 | } catch (final Exception ee) { 190 | Logger.warn("Command '" + this.cmd[0] + "' not supported. Try 'HELP'."); 191 | } 192 | } else { 193 | // try a remote exec 194 | exec("java " + com, false); 195 | } 196 | return ret; 197 | } 198 | } 199 | return ret; 200 | } 201 | 202 | private String[] line2args(final String line) { 203 | // parse the command line 204 | if ((line == null) || (line.isEmpty())) { 205 | return null; 206 | } 207 | // pre-parse 208 | String line1 = ""; 209 | boolean quoted = false; 210 | for (int i = 0; i < line.length(); i++) { 211 | if (quoted) { 212 | if (line.charAt(i) == '"') { 213 | quoted = false; 214 | } else { 215 | line1 = line1 + line.charAt(i); 216 | } 217 | } else { 218 | if (line.charAt(i) == '"') { 219 | quoted = true; 220 | } else if (line.charAt(i) == ' ') { 221 | line1 = line1 + '|'; 222 | } else { 223 | line1 = line1 + line.charAt(i); 224 | } 225 | } 226 | } 227 | return line1.split("\\|"); 228 | } 229 | 230 | static class cl extends ClassLoader { 231 | 232 | public cl() { 233 | super(); 234 | } 235 | 236 | @Override 237 | public synchronized Class loadClass(final String classname, final boolean resolve) throws ClassNotFoundException { 238 | Class c = findLoadedClass(classname); 239 | if (c == null) { 240 | try { 241 | // second try: ask the system 242 | c = findSystemClass(classname); 243 | } catch (final ClassNotFoundException e) { 244 | // third try: load myself 245 | final File f = new File(System.getProperty("user.dir"), classname + ".class"); 246 | final int length = (int) f.length(); 247 | final byte[] classbytes = new byte[length]; 248 | DataInputStream in = null; 249 | try { 250 | in = new DataInputStream(new FileInputStream(f)); 251 | in.readFully(classbytes); 252 | c = defineClass(classname, classbytes, 0, classbytes.length); 253 | } catch (final FileNotFoundException ee) { 254 | throw new ClassNotFoundException(); 255 | } catch (final IOException ee) { 256 | throw new ClassNotFoundException(); 257 | } finally { 258 | try { 259 | in.close(); 260 | } catch (final IOException ioe) { 261 | Logger.warn("Could not close input stream on file " + f); 262 | } 263 | } 264 | } 265 | } 266 | if (resolve) { 267 | resolveClass(c); 268 | } 269 | return c; 270 | } 271 | 272 | } 273 | 274 | private void javaexec(final String[] inArgs) { 275 | final String obj = inArgs[0]; 276 | final String[] args = new String[inArgs.length - 1]; 277 | 278 | // remove the object name from the array of arguments 279 | System.arraycopy(inArgs, 1, args, 0, inArgs.length - 1); 280 | 281 | // Build the argument list for invoke() method. 282 | final Object[] argList = new Object[1]; 283 | argList[0] = args; 284 | 285 | final Properties pr = System.getProperties(); 286 | final String origPath = (String) pr.get("java.class.path"); 287 | try { 288 | 289 | // set the user.dir to the actual local path 290 | pr.put("user.dir", this.currentLocalPath.toString()); 291 | 292 | // add the current path to the classpath 293 | // pr.put("java.class.path", "" + pr.get("user.dir") + 294 | // pr.get("path.separator") + origPath); 295 | 296 | // Logger.warning("System Properties: " + pr.toString()); 297 | 298 | System.setProperties(pr); 299 | 300 | // locate object 301 | final Class c = (new cl()).loadClass(obj); 302 | // Class c = this.getClass().getClassLoader().loadClass(obj); 303 | 304 | // locate public static main(String[]) method 305 | final Class[] parameterType = (Class[]) Array.newInstance(Class.class, 1); 306 | parameterType[0] = Class.forName("[Ljava.lang.String;"); 307 | Method m = c.getMethod("main", parameterType); 308 | 309 | // invoke object.main() 310 | final Object result = m.invoke(null, argList); 311 | //parameterType = null; 312 | m = null; 313 | 314 | // handle result 315 | if (result != null) { 316 | Logger.info("returns " + result); 317 | } 318 | 319 | // set the local path to the user.dir (which may have changed) 320 | this.currentLocalPath = new File((String) pr.get("user.dir")); 321 | 322 | } catch (final ClassNotFoundException e) { 323 | // Logger.warning("cannot find class file " + obj + 324 | // ".class"); 325 | // class file does not exist, go silently over it to not show 326 | // everybody that the 327 | // system attempted to load a class file 328 | Logger.warn("Command '" + obj + "' not supported. Try 'HELP'."); 329 | } catch (final NoSuchMethodException e) { 330 | Logger.warn("no \"public static main(String args[])\" in " + obj); 331 | } catch (final InvocationTargetException e) { 332 | final Throwable orig = e.getTargetException(); 333 | if (orig.getMessage() != null) { 334 | Logger.warn("Exception from " + obj + ": " + orig.getMessage(), orig); 335 | } 336 | } catch (final IllegalAccessException e) { 337 | Logger.warn("Illegal access for " + obj + ": class is probably not declared as public", e); 338 | } catch (final NullPointerException e) { 339 | Logger.warn("main(String args[]) is not defined as static for " + obj); 340 | /* 341 | * } catch (final IOException e) { // class file does not exist, go 342 | * silently over it to not show everybody that the // system 343 | * attempted to load a class file Logger.warning("Command '" + obj + "' 344 | * not supported. Try 'HELP'."); 345 | */ 346 | } catch (final Exception e) { 347 | Logger.warn("Exception caught: ", e); 348 | } 349 | 350 | // set the classpath to its original definition 351 | pr.put("java.class.path", origPath); 352 | 353 | } 354 | 355 | // FTP CLIENT COMMANDS ------------------------------------ 356 | 357 | public boolean ASCII() { 358 | if (this.cmd.length != 1) { 359 | Logger.warn("Syntax: ASCII (no parameter)"); 360 | return true; 361 | } 362 | try { 363 | literal("TYPE A"); 364 | } catch (final IOException e) { 365 | Logger.warn("Error: ASCII transfer type not supported by server."); 366 | } 367 | return true; 368 | } 369 | 370 | public boolean BINARY() { 371 | if (this.cmd.length != 1) { 372 | Logger.warn("Syntax: BINARY (no parameter)"); 373 | return true; 374 | } 375 | try { 376 | literal("TYPE I"); 377 | } catch (final IOException e) { 378 | Logger.warn("Error: BINARY transfer type not supported by server."); 379 | } 380 | return true; 381 | } 382 | 383 | public boolean BYE() { 384 | return QUIT(); 385 | } 386 | 387 | public boolean CD() { 388 | if (this.cmd.length != 2) { 389 | Logger.warn("Syntax: CD "); 390 | return true; 391 | } 392 | if (notConnected()) { 393 | return LCD(); 394 | } 395 | try { 396 | // send cwd command 397 | send("CWD " + this.cmd[1]); 398 | 399 | final String reply = receive(); 400 | if (isNotPositiveCompletion(reply)) { 401 | throw new IOException(reply); 402 | } 403 | } catch (final IOException e) { 404 | Logger.warn("Error: change of working directory to path " + this.cmd[1] + " failed."); 405 | } 406 | return true; 407 | } 408 | 409 | public boolean CLOSE() { 410 | return DISCONNECT(); 411 | } 412 | 413 | private void rmForced(final String path) throws IOException { 414 | // first try: send DELE command (to delete a file) 415 | send("DELE " + path); 416 | // read reply 417 | final String reply1 = receive(); 418 | if (isNotPositiveCompletion(reply1)) { 419 | // second try: send a RMD command (to delete a directory) 420 | send("RMD " + path); 421 | // read reply 422 | final String reply2 = receive(); 423 | if (isNotPositiveCompletion(reply2)) { 424 | // third try: test if this thing is a directory or file and send 425 | // appropriate error message 426 | if (isFolder(path)) { 427 | throw new IOException(reply2); 428 | } 429 | throw new IOException(reply1); 430 | } 431 | } 432 | } 433 | 434 | /** 435 | * @param path 436 | * @return date of entry on ftp-server or now if date can not be obtained 437 | */ 438 | public Date entryDate(final String path) { 439 | final entryInfo info = fileInfo(path); 440 | Date date = null; 441 | if (info != null) { 442 | date = info.date; 443 | } 444 | return date; 445 | } 446 | 447 | public boolean DEL() { 448 | if (this.cmd.length != 2) { 449 | Logger.warn("Syntax: DEL "); 450 | return true; 451 | } 452 | if (notConnected()) { 453 | return LDEL(); 454 | } 455 | try { 456 | rmForced(this.cmd[1]); 457 | } catch (final IOException e) { 458 | Logger.warn("Error: deletion of file " + this.cmd[1] + " failed."); 459 | } 460 | return true; 461 | } 462 | 463 | public boolean RM() { 464 | return DEL(); 465 | } 466 | 467 | public boolean DIR() { 468 | if (this.cmd.length > 2) { 469 | Logger.warn("Syntax: DIR [|]"); 470 | return true; 471 | } 472 | if (notConnected()) { 473 | return LDIR(); 474 | } 475 | try { 476 | List l; 477 | if (this.cmd.length == 2) { 478 | l = list(this.cmd[1], false); 479 | } else { 480 | l = list(".", false); 481 | } 482 | printElements(l); 483 | } catch (final IOException e) { 484 | Logger.warn("Error: remote list not available (1): " + e.getMessage()); 485 | } 486 | return true; 487 | } 488 | 489 | public boolean DISCONNECT() { 490 | try { 491 | quit(); 492 | Logger.info("---- Connection closed."); 493 | } catch (final IOException e) { 494 | // Connection to server lost 495 | // do not append any error to errPrintln because we can silently go over this error 496 | // otherwise the client treats this case as an error and does not accept the result of the session 497 | } 498 | try { 499 | closeConnection(); 500 | } catch (final IOException e) { 501 | this.ControlSocket = null; 502 | this.DataSocketActive = null; 503 | this.DataSocketPassive = null; 504 | this.clientInput = null; 505 | this.clientOutput = null; 506 | } 507 | this.prompt = "ftp [local]>"; 508 | return true; 509 | } 510 | 511 | private String quit() throws IOException { 512 | 513 | send("QUIT"); 514 | 515 | // read status reply 516 | final String reply = receive(); 517 | if (isNotPositiveCompletion(reply)) { 518 | throw new IOException(reply); 519 | } 520 | 521 | closeConnection(); 522 | 523 | return reply; 524 | } 525 | 526 | public boolean EXIT() { 527 | return QUIT(); 528 | } 529 | 530 | public boolean GET() { 531 | if ((this.cmd.length < 2) || (this.cmd.length > 3)) { 532 | Logger.warn("Syntax: GET []"); 533 | return true; 534 | } 535 | final String remote = this.cmd[1]; // (new File(cmd[1])).getName(); 536 | final boolean withoutLocalFile = this.cmd.length == 2; 537 | 538 | final String localFilename = (withoutLocalFile) ? remote : this.cmd[2]; 539 | final File local = absoluteLocalFile(localFilename); 540 | 541 | if (local.exists()) { 542 | Logger.warn("Error: local file " + local.toString() + " already exists.\n" + " File " + remote 543 | + " not retrieved. Local file unchanged."); 544 | } else { 545 | if (withoutLocalFile) { 546 | retrieveFilesRecursively(remote, false); 547 | } else { 548 | try { 549 | get(local.getAbsolutePath(), remote); 550 | } catch (final IOException e) { 551 | Logger.warn("Error: retrieving file " + remote + " failed. (" + e.getMessage() + ")"); 552 | } 553 | } 554 | } 555 | return true; 556 | } 557 | 558 | /** 559 | * @param localFilename 560 | * @return 561 | */ 562 | private File absoluteLocalFile(final String localFilename) { 563 | File local; 564 | final File l = new File(localFilename); 565 | if (l.isAbsolute()) { 566 | local = l; 567 | } else { 568 | local = new File(this.currentLocalPath, localFilename); 569 | } 570 | return local; 571 | } 572 | 573 | private void retrieveFilesRecursively(final String remote, final boolean delete) { 574 | final File local = absoluteLocalFile(remote); 575 | try { 576 | get(local.getAbsolutePath(), remote); 577 | try { 578 | if (delete) { 579 | rmForced(remote); 580 | } 581 | } catch (final IOException eee) { 582 | Logger.warn("Warning: remote file or path " + remote + " cannot be removed."); 583 | } 584 | } catch (final IOException e) { 585 | if (e.getMessage().startsWith("550")) { 586 | // maybe it's a "not a plain file" error message", then it can 587 | // be a folder 588 | // test if this exists (then it should be a folder) 589 | if (isFolder(remote)) { 590 | // copy the whole directory 591 | exec("cd \"" + remote + "\";lmkdir \"" + remote + "\";lcd \"" + remote + "\"", true); 592 | // exec("mget *",true); 593 | try { 594 | for (final String element : list(".", false)) { 595 | retrieveFilesRecursively(element, delete); 596 | } 597 | } catch (final IOException ee) { 598 | } 599 | exec("cd ..;lcd ..", true); 600 | try { 601 | if (delete) { 602 | rmForced(remote); 603 | } 604 | } catch (final IOException eee) { 605 | Logger.warn("Warning: remote file or path " + remote + " cannot be removed."); 606 | } 607 | } else { 608 | Logger.warn("Error: remote file or path " + remote + " does not exist."); 609 | } 610 | } else { 611 | Logger.warn("Error: retrieving file " + remote + " failed. (" + e.getMessage() + ")"); 612 | } 613 | } 614 | } 615 | 616 | /** 617 | * checks if path is a folder 618 | * 619 | * @param path 620 | * @return true if ftp-server changes to path 621 | */ 622 | public boolean isFolder(final String path) { 623 | try { 624 | // /// try to parse LIST output (1 command) 625 | final entryInfo info = fileInfo(path); 626 | if (info != null) { 627 | return info.type == filetype.directory; 628 | } 629 | 630 | // /// try to change to folder (4 commands) 631 | // current folder 632 | final String currentFolder = pwd(); 633 | // check if we can change to folder 634 | send("CWD " + path); 635 | final String reply = receive(); 636 | if (isNotPositiveCompletion(reply)) { 637 | throw new IOException(reply); 638 | } 639 | // check if we actually changed into the folder 640 | final String changedPath = pwd(); 641 | if (!(changedPath.equals(path) || changedPath.equals(currentFolder 642 | + (currentFolder.endsWith("/") ? "" : "/") + path))) { 643 | throw new IOException("folder is '" + changedPath + "' should be '" + path + "'"); 644 | } 645 | // return to last folder 646 | send("CWD " + currentFolder); 647 | /*reply =*/ receive(); 648 | return true; 649 | } catch (final IOException e) { 650 | return false; 651 | } 652 | } 653 | 654 | public boolean GLOB() { 655 | if (this.cmd.length != 1) { 656 | Logger.warn("Syntax: GLOB (no parameter)"); 657 | return true; 658 | } 659 | this.glob = !this.glob; 660 | Logger.info("---- globbing is now turned " + ((this.glob) ? "ON" : "OFF")); 661 | return true; 662 | } 663 | 664 | public boolean HASH() { 665 | Logger.warn("no games implemented"); 666 | return true; 667 | } 668 | 669 | /* 670 | * private static String[] shift(String args[]) { if ((args == null) || 671 | * (args.length == 0)) return args; else { String[] newArgs = new 672 | * String[args.length-1]; System.arraycopy(args, 1, newArgs, 0, 673 | * args.length-1); return newArgs; } } public boolean JAR() { //Sun 674 | * proprietary API may be removed in a future Java release 675 | * sun.tools.jar.Main.main(shift(cmd)); return true; } 676 | */ 677 | 678 | public boolean JJENCODE() { 679 | if (this.cmd.length != 2) { 680 | Logger.warn("Syntax: JJENCODE "); 681 | return true; 682 | } 683 | final String path = this.cmd[1]; 684 | 685 | final File dir = new File(path); 686 | final File newPath = dir.isAbsolute() ? dir : new File(this.currentLocalPath, path); 687 | if (newPath.exists()) { 688 | if (newPath.isDirectory()) { 689 | // exec("cd \"" + remote + "\";lmkdir \"" + remote + "\";lcd \"" 690 | // + remote + "\"",true); 691 | /* 692 | * if not exist %1\nul goto :error cd %1 c:\jdk1.2.2\bin\jar 693 | * -cfM0 ..\%1.jar *.* cd .. c:\jdk1.2.2\bin\jar -cfM %1.jj 694 | * %1.jar del %1.jar 695 | */ 696 | String s = ""; 697 | final String[] l = newPath.list(); 698 | for (final String element : l) { 699 | s = s + " \"" + element + "\""; 700 | } 701 | exec("cd \"" + path + "\";jar -cfM0 ../\"" + path + ".jar\"" + s, true); 702 | exec("cd ..;jar -cfM \"" + path + ".jj\" \"" + path + ".jar\"", true); 703 | exec("rm \"" + path + ".jar\"", true); 704 | } else { 705 | Logger.warn("Error: local path " + newPath.toString() + " denotes not to a directory."); 706 | } 707 | } else { 708 | Logger.warn("Error: local path " + newPath.toString() + " does not exist."); 709 | } 710 | return true; 711 | } 712 | 713 | public boolean JJDECODE() { 714 | if (this.cmd.length != 2) { 715 | Logger.warn("Syntax: JJENCODE "); 716 | return true; 717 | } 718 | final String path = this.cmd[1]; 719 | final File dir = new File(path); 720 | final File newPath = dir.isAbsolute() ? dir : new File(this.currentLocalPath, path); 721 | final File newFolder = new File(newPath.toString() + ".dir"); 722 | if (newPath.exists()) { 723 | if (!newPath.isDirectory()) { 724 | if (!newFolder.mkdir()) { 725 | /* 726 | * if not exist %1.jj goto :error mkdir %1.dir copy %1.jj 727 | * %1.dir\ > %1.dummy && del %1.dummy cd %1.dir 728 | * c:\jdk1.2.2\bin\jar -xf %1.jj del %1.jj 729 | * c:\jdk1.2.2\bin\jar -xf %1.jar del %1.jar cd .. 730 | */ 731 | exec("mkdir \"" + path + ".dir\"", true); 732 | 733 | } else { 734 | Logger.warn("Error: target dir " + newFolder.toString() + " cannot be created"); 735 | } 736 | } else { 737 | Logger.warn("Error: local path " + newPath.toString() + " must denote to jar/jar file"); 738 | } 739 | } else { 740 | Logger.warn("Error: local path " + newPath.toString() + " does not exist."); 741 | } 742 | return true; 743 | } 744 | 745 | private static String[] argList2StringArray(final String argList) { 746 | return argList.split("\\s"); 747 | } 748 | 749 | public boolean JOIN(String[] args) { 750 | 751 | // make sure the specified dest file does not exist 752 | final String dest_name = args[1]; 753 | final File dest_file = new File(dest_name); 754 | if (dest_file.exists()) { 755 | Logger.warn("join: destination file " + dest_name + " already exists"); 756 | return true; 757 | } 758 | 759 | // prepare or search file names of the input files to be joined 760 | String source_name; 761 | File source_file; 762 | int pc = -1; 763 | // create new string array with file names 764 | // scan first for the files 765 | pc = 0; 766 | source_name = dest_name + ".000"; 767 | String argString = ""; 768 | source_file = new File(source_name); 769 | while ((source_file.exists()) && (source_file.isFile()) && (source_file.canRead())) { 770 | argString = argString + " " + source_name; 771 | pc++; 772 | source_name = dest_name + (pc < 10 ? ".00" + pc : (pc < 100 ? ".0" + pc : "." + pc)); 773 | source_file = new File(source_name); 774 | } 775 | args = argList2StringArray(argString.substring(1)); 776 | 777 | // do the join 778 | FileOutputStream dest = null; 779 | FileInputStream source = null; 780 | byte[] buffer; 781 | int bytes_read = 0; 782 | 783 | try { 784 | // open output file 785 | dest = new FileOutputStream(dest_file); 786 | buffer = new byte[1024]; 787 | 788 | // append all source files 789 | for (pc = 0; pc < args.length; pc++) { 790 | // open the source file 791 | source_name = args[pc]; 792 | source_file = new File(source_name); 793 | source = new FileInputStream(source_file); 794 | 795 | // start with the copy of one source file 796 | while (true) { 797 | bytes_read = source.read(buffer); 798 | if (bytes_read == -1) { 799 | break; 800 | } 801 | dest.write(buffer, 0, bytes_read); 802 | } 803 | 804 | // copy finished. close source file 805 | try { 806 | source.close(); 807 | } catch (final IOException e) { 808 | } 809 | } 810 | // close the output file 811 | try { 812 | dest.close(); 813 | } catch (final IOException e) { 814 | } 815 | 816 | // if we come to this point then everything went fine 817 | // if the user wanted to delete the source it is save to do so now 818 | for (pc = 0; pc < args.length; pc++) { 819 | try { 820 | if (!(new File(args[pc])).delete()) { 821 | Logger.warn("join: unable to delete file " + args[pc]); 822 | } 823 | } catch (final SecurityException e) { 824 | Logger.warn("join: no permission to delete file " + args[pc]); 825 | } 826 | } 827 | } catch (final FileNotFoundException e) { 828 | } catch (final IOException e) { 829 | } 830 | 831 | // clean up 832 | finally { 833 | // close any opened streams 834 | if (dest != null) { 835 | try { 836 | dest.close(); 837 | } catch (final IOException e) { 838 | } 839 | } 840 | if (source != null) { 841 | try { 842 | source.close(); 843 | } catch (final IOException e) { 844 | } 845 | } 846 | 847 | // print appropriate message 848 | Logger.warn("join created output from " + args.length + " source files"); 849 | } 850 | return true; 851 | } 852 | 853 | public boolean COPY(final String[] args) { 854 | final File dest_file = new File(args[2]); 855 | if (dest_file.exists()) { 856 | Logger.warn("copy: destination file " + args[2] + " already exists"); 857 | return true; 858 | } 859 | int bytes_read = 0; 860 | FileOutputStream dest = null; 861 | FileInputStream source = null; 862 | try { 863 | // open output file 864 | dest = new FileOutputStream(dest_file); 865 | final byte[] buffer = new byte[1024]; 866 | 867 | // open the source file 868 | final File source_file = new File(args[1]); 869 | source = new FileInputStream(source_file); 870 | 871 | // start with the copy of one source file 872 | while (true) { 873 | bytes_read = source.read(buffer); 874 | if (bytes_read == -1) { 875 | break; 876 | } 877 | dest.write(buffer, 0, bytes_read); 878 | } 879 | 880 | } catch (final FileNotFoundException e) { 881 | } catch (final IOException e) { 882 | } finally { 883 | // copy finished. close source file 884 | if (source != null) { 885 | try { 886 | source.close(); 887 | } catch (final IOException e) { 888 | } 889 | } 890 | 891 | // close the output file 892 | if (dest != null) { 893 | try { 894 | dest.close(); 895 | } catch (final IOException e) { 896 | } 897 | } 898 | } 899 | return true; 900 | } 901 | 902 | public boolean JAVA() { 903 | String s = "JAVA"; 904 | for (int i = 1; i < this.cmd.length; i++) { 905 | s = s + " " + this.cmd[i]; 906 | } 907 | try { 908 | send(s); 909 | /* String reply = */receive(); 910 | } catch (final IOException e) { 911 | } 912 | return true; 913 | } 914 | 915 | public boolean LCD() { 916 | if (this.cmd.length != 2) { 917 | Logger.warn("Syntax: LCD "); 918 | return true; 919 | } 920 | final String path = this.cmd[1]; 921 | final File dir = new File(path); 922 | File newPath = dir.isAbsolute() ? dir : new File(this.currentLocalPath, path); 923 | try { 924 | newPath = new File(newPath.getCanonicalPath()); 925 | } catch (final IOException e) { 926 | } 927 | if (newPath.exists()) { 928 | if (newPath.isDirectory()) { 929 | this.currentLocalPath = newPath; 930 | Logger.info("---- New local path: " + this.currentLocalPath.toString()); 931 | } else { 932 | Logger.warn("Error: local path " + newPath.toString() + " denotes not a directory."); 933 | } 934 | } else { 935 | Logger.warn("Error: local path " + newPath.toString() + " does not exist."); 936 | } 937 | return true; 938 | } 939 | 940 | public boolean LDEL() { 941 | return LRM(); 942 | } 943 | 944 | public boolean LDIR() { 945 | if (this.cmd.length != 1) { 946 | Logger.warn("Syntax: LDIR (no parameter)"); 947 | return true; 948 | } 949 | final String[] name = this.currentLocalPath.list(); 950 | for (final String element : name) { 951 | Logger.info(ls(new File(this.currentLocalPath, element))); 952 | } 953 | return true; 954 | } 955 | 956 | /** 957 | * parse LIST of file 958 | * 959 | * @param path 960 | * on ftp-server 961 | * @return null if info cannot be determined or error occures 962 | */ 963 | public entryInfo fileInfo(final String path) { 964 | if (this.infoCache.containsKey(path)) { 965 | return this.infoCache.get(path); 966 | } 967 | try { 968 | /* 969 | * RFC959 page 33f: If the argument is a pathname, the command is 970 | * analogous to the "list" command except that data shall be 971 | * transferred over the control connection. 972 | */ 973 | send("STAT " + path); 974 | 975 | final String reply = receive(); 976 | if (isNotPositiveCompletion(reply)) { 977 | throw new IOException(reply); 978 | } 979 | 980 | // check if reply is correct multi-line reply 981 | final String[] lines = reply.split("\\r\\n"); 982 | if (lines.length < 3) { 983 | throw new IOException(reply); 984 | } 985 | final int startCode = getStatusCode(lines[0]); 986 | final int endCode = getStatusCode(lines[lines.length - 1]); 987 | if (startCode != endCode) { 988 | throw new IOException(reply); 989 | } 990 | 991 | // first line which gives a result is taken (should be only one) 992 | entryInfo info = null; 993 | final int endFor = lines.length - 1; 994 | for (int i = 1; i < endFor; i++) { 995 | info = parseListData(lines[i]); 996 | if (info != null) { 997 | this.infoCache.put(path, info); 998 | break; 999 | } 1000 | } 1001 | return info; 1002 | } catch (final IOException e) { 1003 | return null; 1004 | } 1005 | } 1006 | 1007 | /** 1008 | * returns status of reply 1009 | * 1010 | * 1 Positive Preliminary reply 2 Positive Completion reply 3 Positive 1011 | * Intermediate reply 4 Transient Negative Completion reply 5 Permanent 1012 | * Negative Completion reply 1013 | * 1014 | * @param reply 1015 | * @return first digit of the reply code 1016 | */ 1017 | private int getStatus(final String reply) { 1018 | return Integer.parseInt(reply.substring(0, 1)); 1019 | } 1020 | 1021 | /** 1022 | * gives reply code 1023 | * 1024 | * @param reply 1025 | * @return 1026 | */ 1027 | private int getStatusCode(final String reply) { 1028 | return Integer.parseInt(reply.substring(0, 3)); 1029 | } 1030 | 1031 | /** 1032 | * checks if status code is in group 2 ("2xx message") 1033 | * 1034 | * @param reply 1035 | * @return 1036 | */ 1037 | private boolean isNotPositiveCompletion(final String reply) { 1038 | return getStatus(reply) != 2; 1039 | } 1040 | 1041 | private final static Pattern lsStyle = Pattern.compile("^([-\\w]{10}).\\s*\\d+\\s+[-\\w]+\\s+[-\\w]+\\s+(\\d+)\\s+(\\w{3})\\s+(\\d+)\\s+(\\d+:?\\d*)\\s+(.*)$"); 1042 | 1043 | /** 1044 | * parses output of LIST from ftp-server currently UNIX ls-style only, ie: 1045 | * -rw-r--r-- 1 root other 531 Jan 29 03:26 README dr-xr-xr-x 2 root 512 Apr 1046 | * 8 1994 etc 1047 | * 1048 | * @param line 1049 | * @return null if not parseable 1050 | */ 1051 | private static entryInfo parseListData(final String line) { 1052 | // groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name 1053 | final Matcher tokens = lsStyle.matcher(line); 1054 | if (tokens.matches() && tokens.groupCount() == 6) { 1055 | filetype type = filetype.file; 1056 | if (tokens.group(1).startsWith("d")) type = filetype.directory; 1057 | if (tokens.group(1).startsWith("l")) type = filetype.link; 1058 | long size = -1; 1059 | try { 1060 | size = Long.parseLong(tokens.group(2)); 1061 | } catch (final NumberFormatException e) { 1062 | Logger.warn("not a number in list-entry: ", e); 1063 | return null; 1064 | } 1065 | String time; 1066 | String year; 1067 | if (tokens.group(5).contains(":")) { 1068 | time = tokens.group(5); 1069 | year = String.valueOf(Calendar.getInstance().get(Calendar.YEAR)); // current 1070 | // year 1071 | } else { 1072 | time = "00:00"; 1073 | year = tokens.group(5); 1074 | } 1075 | // construct date string 1076 | // this has to be done, because the list-entry may have multiple 1077 | // spaces, tabs or so 1078 | Date date; 1079 | final String dateString = tokens.group(3) + " " + tokens.group(4) + " " + year + " " + time; 1080 | try { 1081 | synchronized(lsDateFormat) { 1082 | date = lsDateFormat.parse(dateString); 1083 | } 1084 | } catch (final ParseException e) { 1085 | Logger.warn("---- Error: not ls date-format '" + dateString, e); 1086 | date = new Date(); 1087 | } 1088 | final String filename = tokens.group(6); 1089 | return new entryInfo(type, size, date, filename); 1090 | } 1091 | return null; 1092 | } 1093 | 1094 | 1095 | public static final entryInfo POISON_entryInfo = new entryInfo(); 1096 | 1097 | public static enum filetype { 1098 | file, link, directory; 1099 | } 1100 | 1101 | /** 1102 | * parameter class 1103 | * 1104 | * @author danielr 1105 | * @since 2008-03-13 r4558 1106 | */ 1107 | public static class entryInfo { 1108 | /** 1109 | * file type 1110 | */ 1111 | public final filetype type; 1112 | /** 1113 | * size in bytes 1114 | */ 1115 | public final long size; 1116 | /** 1117 | * date of file 1118 | */ 1119 | public final Date date; 1120 | /** 1121 | * name of entry 1122 | */ 1123 | public String name; 1124 | 1125 | public entryInfo() { 1126 | this.type = filetype.file; 1127 | this.size = -1; 1128 | this.date = null; 1129 | this.name = null; 1130 | } 1131 | 1132 | /** 1133 | * constructor 1134 | * 1135 | * @param isDir 1136 | * @param size 1137 | * bytes 1138 | * @param date 1139 | * @param name 1140 | */ 1141 | public entryInfo(final filetype type, final long size, final Date date, final String name) { 1142 | this.type = type; 1143 | this.size = size; 1144 | this.date = date; 1145 | this.name = name; 1146 | } 1147 | 1148 | /* 1149 | * (non-Javadoc) 1150 | * 1151 | * @see java.lang.Object#toString() 1152 | */ 1153 | @Override 1154 | public String toString() { 1155 | final StringBuilder info = new StringBuilder(100); 1156 | info.append(this.name); 1157 | info.append(" (type="); 1158 | info.append(this.type.name()); 1159 | info.append(", size="); 1160 | info.append(this.size); 1161 | info.append(", "); 1162 | info.append(this.date); 1163 | info.append(")"); 1164 | return info.toString(); 1165 | } 1166 | } 1167 | 1168 | private String ls(final File inode) { 1169 | if ((inode == null) || (!inode.exists())) { 1170 | return ""; 1171 | } 1172 | String s = ""; 1173 | if (inode.isDirectory()) { 1174 | s = s + "d"; 1175 | } else if (inode.isFile()) { 1176 | s = s + "-"; 1177 | } else { 1178 | s = s + "?"; 1179 | } 1180 | if (inode.canRead()) { 1181 | s = s + "r"; 1182 | } else { 1183 | s = s + "-"; 1184 | } 1185 | if (inode.canWrite()) { 1186 | s = s + "w"; 1187 | } else { 1188 | s = s + "-"; 1189 | } 1190 | s = s + " " + lenformatted(Long.toString(inode.length()), 9); 1191 | final DateFormat df = DateFormat.getDateTimeInstance(); 1192 | s = s + " " + df.format(new Date(inode.lastModified())); 1193 | s = s + " " + inode.getName(); 1194 | if (inode.isDirectory()) { 1195 | s = s + "/"; 1196 | } 1197 | return s; 1198 | } 1199 | 1200 | private String lenformatted(String s, int l) { 1201 | l = l - s.length(); 1202 | while (l > 0) { 1203 | s = " " + s; 1204 | l--; 1205 | } 1206 | return s; 1207 | } 1208 | 1209 | public boolean LITERAL() { 1210 | if (this.cmd.length == 1) { 1211 | Logger.warn("Syntax: LITERAL [] (see RFC959)"); 1212 | return true; 1213 | } 1214 | String s = ""; 1215 | for (int i = 1; i < this.cmd.length; i++) { 1216 | s = s + " " + this.cmd[i]; 1217 | } 1218 | try { 1219 | literal(s.substring(1)); 1220 | } catch (final IOException e) { 1221 | Logger.warn("Error: Syntax of FTP-command wrong. See RFC959 for details."); 1222 | } 1223 | return true; 1224 | } 1225 | 1226 | public boolean LLS() { 1227 | return LDIR(); 1228 | } 1229 | 1230 | public boolean LMD() { 1231 | return LMKDIR(); 1232 | } 1233 | 1234 | public boolean LMKDIR() { 1235 | if (this.cmd.length != 2) { 1236 | Logger.warn("Syntax: LMKDIR "); 1237 | return true; 1238 | } 1239 | final File f = new File(this.currentLocalPath, this.cmd[1]); 1240 | if (f.exists()) { 1241 | Logger.warn("Error: local file/folder " + this.cmd[1] + " already exists"); 1242 | } else { 1243 | if (!f.mkdir()) { 1244 | Logger.warn("Error: creation of local folder " + this.cmd[1] + " failed"); 1245 | } 1246 | } 1247 | return true; 1248 | } 1249 | 1250 | public boolean LMV() { 1251 | if (this.cmd.length != 3) { 1252 | Logger.warn("Syntax: LMV "); 1253 | return true; 1254 | } 1255 | final File from = new File(this.cmd[1]); 1256 | final File to = new File(this.cmd[2]); 1257 | if (!to.exists()) { 1258 | if (from.renameTo(to)) { 1259 | Logger.info("---- \"" + from.toString() + "\" renamed to \"" + to.toString() + "\""); 1260 | } else { 1261 | Logger.warn("rename failed"); 1262 | } 1263 | } else { 1264 | Logger.warn("\"" + to.toString() + "\" already exists"); 1265 | } 1266 | return true; 1267 | } 1268 | 1269 | public boolean LPWD() { 1270 | if (this.cmd.length != 1) { 1271 | Logger.warn("Syntax: LPWD (no parameter)"); 1272 | return true; 1273 | } 1274 | Logger.info("---- Local path: " + this.currentLocalPath.toString()); 1275 | return true; 1276 | } 1277 | 1278 | public boolean LRD() { 1279 | return LMKDIR(); 1280 | } 1281 | 1282 | public boolean LRMDIR() { 1283 | if (this.cmd.length != 2) { 1284 | Logger.warn("Syntax: LRMDIR "); 1285 | return true; 1286 | } 1287 | final File f = new File(this.currentLocalPath, this.cmd[1]); 1288 | if (!f.exists()) { 1289 | Logger.warn("Error: local folder " + this.cmd[1] + " does not exist"); 1290 | } else { 1291 | if (!f.delete()) { 1292 | Logger.warn("Error: deletion of local folder " + this.cmd[1] + " failed"); 1293 | } 1294 | } 1295 | return true; 1296 | } 1297 | 1298 | public boolean LRM() { 1299 | if (this.cmd.length != 2) { 1300 | Logger.warn("Syntax: LRM "); 1301 | return true; 1302 | } 1303 | final File f = new File(this.currentLocalPath, this.cmd[1]); 1304 | if (!f.exists()) { 1305 | Logger.warn("Error: local file " + this.cmd[1] + " does not exist"); 1306 | } else { 1307 | if (!f.delete()) { 1308 | Logger.warn("Error: deletion of file " + this.cmd[1] + " failed"); 1309 | } 1310 | } 1311 | return true; 1312 | } 1313 | 1314 | public boolean LS() { 1315 | if (this.cmd.length > 2) { 1316 | Logger.warn("Syntax: LS [|]"); 1317 | return true; 1318 | } 1319 | if (notConnected()) { 1320 | return LLS(); 1321 | } 1322 | try { 1323 | List l; 1324 | if (this.cmd.length == 2) { 1325 | l = list(this.cmd[1], true); 1326 | } else { 1327 | l = list(".", true); 1328 | } 1329 | printElements(l); 1330 | } catch (final IOException e) { 1331 | Logger.warn("Error: remote list not available (2): " + e.getMessage()); 1332 | } 1333 | return true; 1334 | } 1335 | 1336 | /** 1337 | * @param list 1338 | */ 1339 | private void printElements(final List list) { 1340 | Logger.info("---- v---v---v---v---v---v---v---v---v---v---v---v---v---v---v---v---v---v---v"); 1341 | for (final String element : list) { 1342 | Logger.info(element); 1343 | } 1344 | Logger.info("---- ^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^"); 1345 | } 1346 | 1347 | public List list(final String path, final boolean extended) throws IOException { 1348 | 1349 | createDataSocket(); 1350 | 1351 | send("CWD " + path); 1352 | String reply = receive(); 1353 | // get status code 1354 | int status = getStatus(reply); 1355 | if (status > 2) { 1356 | throw new IOException(reply); 1357 | } 1358 | 1359 | // send command to the control port 1360 | if (extended) { 1361 | send("LIST"); 1362 | } else { 1363 | send("NLST"); 1364 | } 1365 | 1366 | // read status of the command from the control port 1367 | reply = receive(); 1368 | 1369 | // get status code 1370 | status = getStatus(reply); 1371 | if (status != 1) { 1372 | throw new IOException(reply); 1373 | } 1374 | 1375 | // starting data transaction 1376 | final Socket dataSocket = getDataSocket(); 1377 | final BufferedReader dataStream = new BufferedReader(new InputStreamReader(dataSocket.getInputStream())); 1378 | 1379 | // read file system data 1380 | String line; 1381 | final ArrayList files = new ArrayList(); 1382 | try { 1383 | while ((line = dataStream.readLine()) != null) { 1384 | if (!line.startsWith("total ")) { 1385 | files.add(line); 1386 | } 1387 | } 1388 | } catch (final IOException e1) { 1389 | e1.printStackTrace(); 1390 | } finally {try { 1391 | // shutdown data connection 1392 | dataStream.close(); // Closing the returned InputStream will 1393 | closeDataSocket(); // close the associated socket. 1394 | } catch (final IOException e) { 1395 | e.printStackTrace(); 1396 | }} 1397 | // after stream is empty we should get control completion echo 1398 | reply = receive(); 1399 | //System.out.println("reply of LIST: " + reply); 1400 | // boolean success = !isNotPositiveCompletion(reply); 1401 | //for (String s: files) System.out.println("FILES of '" + path + "': " + s); 1402 | 1403 | files.trimToSize(); 1404 | return files; 1405 | } 1406 | 1407 | public boolean MDIR() { 1408 | return MKDIR(); 1409 | } 1410 | 1411 | public boolean MKDIR() { 1412 | if (this.cmd.length != 2) { 1413 | Logger.warn("Syntax: MKDIR "); 1414 | return true; 1415 | } 1416 | if (notConnected()) { 1417 | return LMKDIR(); 1418 | } 1419 | try { 1420 | // send mkdir command 1421 | send("MKD " + this.cmd[1]); 1422 | // read reply 1423 | final String reply = receive(); 1424 | if (isNotPositiveCompletion(reply)) { 1425 | throw new IOException(reply); 1426 | } 1427 | } catch (final IOException e) { 1428 | Logger.warn("Error: creation of folder " + this.cmd[1] + " failed"); 1429 | } 1430 | return true; 1431 | } 1432 | 1433 | public boolean MGET() { 1434 | if (this.cmd.length != 2) { 1435 | Logger.warn("Syntax: MGET "); 1436 | return true; 1437 | } 1438 | try { 1439 | mget(this.cmd[1], false); 1440 | } catch (final IOException e) { 1441 | Logger.warn("Error: mget failed (" + e.getMessage() + ")"); 1442 | } 1443 | return true; 1444 | } 1445 | 1446 | private void mget(final String pattern, final boolean remove) throws IOException { 1447 | final List l = list(".", false); 1448 | File local; 1449 | for (final String remote : l) { 1450 | if (matches(remote, pattern)) { 1451 | local = new File(this.currentLocalPath, remote); 1452 | if (local.exists()) { 1453 | Logger.warn("Warning: local file " + local.toString() + " overwritten."); 1454 | if(!local.delete()) 1455 | Logger.warn("Warning: local file " + local.toString() + " could not be deleted."); 1456 | } 1457 | retrieveFilesRecursively(remote, remove); 1458 | } 1459 | } 1460 | } 1461 | 1462 | public boolean MOVEDOWN() { 1463 | if (this.cmd.length != 2) { 1464 | Logger.warn("Syntax: MOVEDOWN "); 1465 | return true; 1466 | } 1467 | try { 1468 | mget(this.cmd[1], true); 1469 | } catch (final IOException e) { 1470 | Logger.warn("Error: movedown failed (" + e.getMessage() + ")"); 1471 | } 1472 | return true; 1473 | } 1474 | 1475 | /** 1476 | * public boolean MOVEUP() { } 1477 | * 1478 | * @return 1479 | */ 1480 | public boolean MV() { 1481 | if (this.cmd.length != 3) { 1482 | Logger.warn("Syntax: MV "); 1483 | return true; 1484 | } 1485 | if (notConnected()) { 1486 | return LMV(); 1487 | } 1488 | try { 1489 | // send rename commands 1490 | send("RNFR " + this.cmd[1]); 1491 | // read reply 1492 | String reply = receive(); 1493 | if (isNotPositiveCompletion(reply)) { 1494 | throw new IOException(reply); 1495 | } 1496 | send("RNTO " + this.cmd[2]); 1497 | // read reply 1498 | reply = receive(); 1499 | if (isNotPositiveCompletion(reply)) { 1500 | throw new IOException(reply); 1501 | } 1502 | } catch (final IOException e) { 1503 | Logger.warn("Error: rename of " + this.cmd[1] + " to " + this.cmd[2] + " failed."); 1504 | } 1505 | return true; 1506 | } 1507 | 1508 | public boolean NOOP() { 1509 | if (this.cmd.length != 1) { 1510 | Logger.warn("Syntax: NOOP (no parameter)"); 1511 | return true; 1512 | } 1513 | try { 1514 | literal("NOOP"); 1515 | } catch (final IOException e) { 1516 | Logger.warn("Error: server does not know how to do nothing"); 1517 | } 1518 | return true; 1519 | } 1520 | 1521 | public boolean OPEN() { 1522 | if ((this.cmd.length < 2) || (this.cmd.length > 3)) { 1523 | Logger.warn("Syntax: OPEN []"); 1524 | return true; 1525 | } 1526 | int port = 21; 1527 | if (this.cmd.length == 3) { 1528 | try { 1529 | port = java.lang.Integer.parseInt(this.cmd[2]); 1530 | } catch (final NumberFormatException e) { 1531 | port = 21; 1532 | } 1533 | } 1534 | if (this.cmd[1].indexOf(':',0) > 0) { 1535 | // port is given 1536 | port = java.lang.Integer.parseInt(this.cmd[1].substring(this.cmd[1].indexOf(':',0) + 1)); 1537 | this.cmd[1] = this.cmd[1].substring(0, this.cmd[1].indexOf(':',0)); 1538 | } 1539 | try { 1540 | open(this.cmd[1], port); 1541 | Logger.info("---- Connection to " + this.cmd[1] + " established."); 1542 | this.prompt = "ftp [" + this.cmd[1] + "]>"; 1543 | } catch (final IOException e) { 1544 | Logger.warn("Error: connecting " + this.cmd[1] + " on port " + port + " failed: " + e.getMessage()); 1545 | } 1546 | return true; 1547 | } 1548 | 1549 | public void open(final String host, final int port) throws IOException { 1550 | if (this.ControlSocket != null) { 1551 | exec("close", false); // close any existing connections first 1552 | } 1553 | 1554 | try { 1555 | this.ControlSocket = new Socket(); 1556 | this.ControlSocket.setSoTimeout(getTimeout()); 1557 | this.ControlSocket.setKeepAlive(true); 1558 | this.ControlSocket.setTcpNoDelay(true); // no accumulation until buffer is full 1559 | this.ControlSocket.setSoLinger(false, getTimeout()); // !wait for all data being written on close() 1560 | this.ControlSocket.setSendBufferSize(1440); // read http://www.cisco.com/warp/public/105/38.shtml 1561 | this.ControlSocket.setReceiveBufferSize(1440); // read http://www.cisco.com/warp/public/105/38.shtml 1562 | this.ControlSocket.connect(new InetSocketAddress(host, port), 1000); 1563 | this.clientInput = new BufferedReader(new InputStreamReader(this.ControlSocket.getInputStream())); 1564 | this.clientOutput = new DataOutputStream(new BufferedOutputStream(this.ControlSocket.getOutputStream())); 1565 | 1566 | // read and return server message 1567 | this.host = host; 1568 | this.port = port; 1569 | this.remotemessage = receive(); 1570 | if ((this.remotemessage != null) && (this.remotemessage.length() > 3)) { 1571 | this.remotemessage = this.remotemessage.substring(4); 1572 | } 1573 | } catch (final IOException e) { 1574 | // if a connection was opened, it should not be used 1575 | closeConnection(); 1576 | throw new IOException(e.getMessage()); 1577 | } 1578 | } 1579 | 1580 | /** 1581 | * @return 1582 | */ 1583 | public boolean notConnected() { 1584 | return this.ControlSocket == null; 1585 | } 1586 | 1587 | /** 1588 | * close all sockets 1589 | * 1590 | * @throws IOException 1591 | */ 1592 | private void closeConnection() throws IOException { 1593 | // cleanup 1594 | if (this.clientOutput != null) this.clientOutput.close(); 1595 | if (this.clientInput != null) this.clientInput.close(); 1596 | if (this.ControlSocket != null) this.ControlSocket.close(); 1597 | if (this.DataSocketActive != null) this.DataSocketActive.close(); 1598 | if (this.DataSocketPassive != null) this.DataSocketPassive.close(); 1599 | } 1600 | 1601 | public boolean PROMPT() { 1602 | Logger.warn("prompt is always off"); 1603 | return true; 1604 | } 1605 | 1606 | public boolean PUT() { 1607 | if ((this.cmd.length < 2) || (this.cmd.length > 3)) { 1608 | Logger.warn("Syntax: PUT []"); 1609 | return true; 1610 | } 1611 | final File local = new File(this.currentLocalPath, this.cmd[1]); 1612 | final String remote = (this.cmd.length == 2) ? local.getName() : this.cmd[2]; 1613 | if (!local.exists()) { 1614 | Logger.warn("Error: local file " + local.toString() + " does not exist."); 1615 | Logger.warn(" Remote file " + remote + " not overwritten."); 1616 | } else { 1617 | try { 1618 | put(local.getAbsolutePath(), remote); 1619 | } catch (final IOException e) { 1620 | Logger.warn("Error: transmitting file " + local.toString() + " failed."); 1621 | } 1622 | } 1623 | return true; 1624 | } 1625 | 1626 | public boolean PWD() { 1627 | if (this.cmd.length > 1) { 1628 | Logger.warn("Syntax: PWD (no parameter)"); 1629 | return true; 1630 | } 1631 | if (notConnected()) { 1632 | return LPWD(); 1633 | } 1634 | try { 1635 | Logger.info("---- Current remote path is: " + pwd()); 1636 | } catch (final IOException e) { 1637 | Logger.warn("Error: remote path not available"); 1638 | } 1639 | return true; 1640 | } 1641 | 1642 | private String pwd() throws IOException { 1643 | // send pwd command 1644 | send("PWD"); 1645 | 1646 | // read current directory 1647 | final String reply = receive(); 1648 | if (isNotPositiveCompletion(reply)) { 1649 | throw new IOException(reply); 1650 | } 1651 | 1652 | // parse directory name out of the reply 1653 | return reply.substring(5, reply.lastIndexOf('"')); 1654 | } 1655 | 1656 | public boolean REMOTEHELP() { 1657 | if (this.cmd.length != 1) { 1658 | Logger.warn("Syntax: REMOTEHELP (no parameter)"); 1659 | return true; 1660 | } 1661 | try { 1662 | literal("HELP"); 1663 | } catch (final IOException e) { 1664 | Logger.warn("Error: remote help not supported by server."); 1665 | } 1666 | return true; 1667 | } 1668 | 1669 | public boolean RMDIR() { 1670 | if (this.cmd.length != 2) { 1671 | Logger.warn("Syntax: RMDIR "); 1672 | return true; 1673 | } 1674 | if (notConnected()) { 1675 | return LRMDIR(); 1676 | } 1677 | try { 1678 | rmForced(this.cmd[1]); 1679 | } catch (final IOException e) { 1680 | Logger.warn("Error: deletion of folder " + this.cmd[1] + " failed."); 1681 | } 1682 | return true; 1683 | } 1684 | 1685 | public boolean QUIT() { 1686 | if (!notConnected()) { 1687 | exec("close", false); 1688 | } 1689 | return false; 1690 | } 1691 | 1692 | public boolean RECV() { 1693 | return GET(); 1694 | } 1695 | 1696 | /** 1697 | * size of file on ftp-server (maybe size of directory-entry is possible) 1698 | * 1699 | * @param path 1700 | * @return size in bytes or -1 if size cannot be determinied 1701 | */ 1702 | public long fileSize(final String path) { 1703 | long size = -1; 1704 | try { 1705 | // extended FTP 1706 | size = size(path); 1707 | } catch (final IOException e) { 1708 | // else with LIST-data 1709 | final entryInfo info = fileInfo(path); 1710 | if (info != null) { 1711 | size = info.size; 1712 | } 1713 | } 1714 | return size; 1715 | } 1716 | 1717 | public int size(final String path) throws IOException { 1718 | // get the size of a file. If the given path targets to a directory, a 1719 | // -1 is returned 1720 | // this function is not supported by standard rfc 959. The method is 1721 | // descibed in RFC 3659 Extensions to FTP 1722 | // if the method is not supported by the target server, this throws an 1723 | // IOException with the 1724 | // server response as exception message 1725 | 1726 | // send command to the control port 1727 | send("SIZE " + path); 1728 | 1729 | // read status of the command from the control port 1730 | final String reply = receive(); 1731 | 1732 | if (getStatusCode(reply) != 213) { 1733 | throw new IOException(reply); 1734 | } 1735 | 1736 | try { 1737 | return Integer.parseInt(reply.substring(4)); 1738 | } catch (final NumberFormatException e) { 1739 | throw new IOException(reply); 1740 | } 1741 | } 1742 | 1743 | public boolean USER() { 1744 | if (this.cmd.length != 3) { 1745 | Logger.warn("Syntax: USER "); 1746 | return true; 1747 | } 1748 | try { 1749 | login(this.cmd[1], this.cmd[2]); 1750 | Logger.info("---- Granted access for user " + this.cmd[1] + "."); 1751 | } catch (final IOException e) { 1752 | Logger.warn("Error: authorization of user " + this.cmd[1] + " failed: " + e.getMessage()); 1753 | } 1754 | return true; 1755 | } 1756 | 1757 | public boolean APPEND() { 1758 | Logger.warn("not yet supported"); 1759 | return true; 1760 | } 1761 | 1762 | public boolean HELP() { 1763 | Logger.info("---- ftp HELP ----"); 1764 | Logger.info(""); 1765 | Logger.info("This ftp client shell can act as command shell for the local host as well for the"); 1766 | Logger.info("remote host. Commands that point to the local host are preceded by 'L'."); 1767 | Logger.info(""); 1768 | Logger.info("Supported Commands:"); 1769 | Logger.info("ASCII"); 1770 | Logger.info(" switch remote server to ASCII transfer mode"); 1771 | Logger.info("BINARY"); 1772 | Logger.info(" switch remote server to BINARY transfer mode"); 1773 | Logger.info("BYE"); 1774 | Logger.info(" quit the command shell (same as EXIT)"); 1775 | Logger.info("CD "); 1776 | Logger.info(" change remote path"); 1777 | Logger.info("CLOSE"); 1778 | Logger.info(" close connection to remote host (same as DISCONNECT)"); 1779 | Logger.info("DEL "); 1780 | Logger.info(" delete file on remote server (same as RM)"); 1781 | Logger.info("RM "); 1782 | Logger.info(" remove file from remote server (same as DEL)"); 1783 | Logger.info("DIR [|] "); 1784 | Logger.info(" print file information for remote directory or file"); 1785 | Logger.info("DISCONNECT"); 1786 | Logger.info(" disconnect from remote server (same as CLOSE)"); 1787 | Logger.info("EXIT"); 1788 | Logger.info(" quit the command shell (same as BYE)"); 1789 | Logger.info("GET []"); 1790 | Logger.info(" load from remote server and store it locally,"); 1791 | Logger.info(" optionally to . if the is a directory,"); 1792 | Logger.info(" then all files in that directory are retrieved,"); 1793 | Logger.info(" including recursively all subdirectories."); 1794 | Logger.info("GLOB"); 1795 | Logger.info(" toggles globbing: matching with wild cards or not"); 1796 | Logger.info("COPY"); 1797 | Logger.info(" copies local files"); 1798 | Logger.info("LCD "); 1799 | Logger.info(" local directory change"); 1800 | Logger.info("LDEL "); 1801 | Logger.info(" local file delete"); 1802 | Logger.info("LDIR"); 1803 | Logger.info(" shows local directory content"); 1804 | Logger.info("LITERAL []"); 1805 | Logger.info(" Sends FTP commands as documented in RFC959"); 1806 | Logger.info("LLS"); 1807 | Logger.info(" as LDIR"); 1808 | Logger.info("LMD"); 1809 | Logger.info(" as LMKDIR"); 1810 | Logger.info("LMV "); 1811 | Logger.info(" copies local files"); 1812 | Logger.info("LPWD"); 1813 | Logger.info(" prints local path"); 1814 | Logger.info("LRD"); 1815 | Logger.info(" as LMKDIR"); 1816 | Logger.info("LRMD "); 1817 | Logger.info(" deletes local directory "); 1818 | Logger.info("LRM "); 1819 | Logger.info(" deletes local file "); 1820 | Logger.info("LS [|]"); 1821 | Logger.info(" prints list of remote directory or information of file "); 1822 | Logger.info("MDIR"); 1823 | Logger.info(" as MKDIR"); 1824 | Logger.info("MGET "); 1825 | Logger.info(" copies files from remote server that fits into the"); 1826 | Logger.info(" pattern to the local path."); 1827 | Logger.info("MOVEDOWN "); 1828 | Logger.info(" copies files from remote server as with MGET"); 1829 | Logger.info(" and deletes them afterwards on the remote server"); 1830 | Logger.info("MV "); 1831 | Logger.info(" moves or renames files on the local host"); 1832 | Logger.info("NOOP"); 1833 | Logger.info(" sends the NOOP command to the remote server (which does nothing)"); 1834 | Logger.info(" This command is usually used to measure the speed of the remote server."); 1835 | Logger.info("OPEN []"); 1836 | Logger.info(" connects the ftp shell to the remote server . Optionally,"); 1837 | Logger.info(" a port number can be given, the default port number is 21."); 1838 | Logger.info(" Example: OPEN localhost:2121 or OPEN 192.168.0.1 2121"); 1839 | Logger.info("PROMPT"); 1840 | Logger.info(" compatibility command, that usually toggles beween prompting on or off."); 1841 | Logger.info(" ftp has prompting switched off by default and cannot switched on."); 1842 | Logger.info("PUT []"); 1843 | Logger.info(" copies the to the remote server to the current remote path or"); 1844 | Logger.info(" optionally to the given path."); 1845 | Logger.info("PWD"); 1846 | Logger.info(" prints current path on the remote server."); 1847 | Logger.info("REMOTEHELP"); 1848 | Logger.info(" asks the remote server to print the help text of the remote server"); 1849 | Logger.info("RMDIR "); 1850 | Logger.info(" removes the directory on the remote server"); 1851 | Logger.info("QUIT"); 1852 | Logger.info(" exits the ftp application"); 1853 | Logger.info("RECV"); 1854 | Logger.info(" as GET"); 1855 | Logger.info("USER "); 1856 | Logger.info(" Loggers into the remote server with the user "); 1857 | Logger.info(" and the password "); 1858 | Logger.info(""); 1859 | Logger.info(""); 1860 | Logger.info("EXAMPLE:"); 1861 | Logger.info("a standard sessions looks like this"); 1862 | Logger.info(">open 192.168.0.1:2121"); 1863 | Logger.info(">user anonymous bob"); 1864 | Logger.info(">pwd"); 1865 | Logger.info(">ls"); 1866 | Logger.info(">....."); 1867 | Logger.info(""); 1868 | Logger.info(""); 1869 | return true; 1870 | } 1871 | 1872 | public boolean QUOTE() { 1873 | Logger.warn("not yet supported"); 1874 | return true; 1875 | } 1876 | 1877 | public boolean BELL() { 1878 | Logger.warn("not yet supported"); 1879 | return true; 1880 | } 1881 | 1882 | public boolean MDELETE() { 1883 | Logger.warn("not yet supported"); 1884 | return true; 1885 | } 1886 | 1887 | public boolean SEND() { 1888 | Logger.warn("not yet supported"); 1889 | return true; 1890 | } 1891 | 1892 | public boolean DEBUG() { 1893 | Logger.warn("not yet supported"); 1894 | return true; 1895 | } 1896 | 1897 | public boolean MLS() { 1898 | Logger.warn("not yet supported"); 1899 | return true; 1900 | } 1901 | 1902 | public boolean TRACE() { 1903 | Logger.warn("not yet supported"); 1904 | return true; 1905 | } 1906 | 1907 | public boolean MPUT() { 1908 | Logger.warn("not yet supported"); 1909 | return true; 1910 | } 1911 | 1912 | public boolean TYPE() { 1913 | Logger.warn("not yet supported"); 1914 | return true; 1915 | } 1916 | 1917 | public boolean CREATE() { 1918 | Logger.warn("not yet supported"); 1919 | return true; 1920 | } 1921 | 1922 | // helper functions 1923 | 1924 | private boolean matches(final String name, final String pattern) { 1925 | // checks whether the string name matches with the pattern 1926 | // the pattern may contain characters '*' as wildcard for several 1927 | // characters (also none) and '?' to match exactly one characters 1928 | // Logger.info("MATCH " + name + " " + pattern); 1929 | if (!this.glob) { 1930 | return name.equals(pattern); 1931 | } 1932 | if (pattern.equals("*")) { 1933 | return true; 1934 | } 1935 | if (pattern.length() > 0 && pattern.charAt(0) == '*' && pattern.endsWith("*")) { 1936 | return // avoid recursion deadlock 1937 | ((matches(name, pattern.substring(1))) || (matches(name, pattern.substring(0, pattern.length() - 1)))); 1938 | } 1939 | try { 1940 | int i = pattern.indexOf('?',0); 1941 | if (i >= 0) { 1942 | if (!(matches(name.substring(0, i), pattern.substring(0, i)))) { 1943 | return false; 1944 | } 1945 | return (matches(name.substring(i + 1), pattern.substring(i + 1))); 1946 | } 1947 | i = pattern.indexOf('*',0); 1948 | if (i >= 0) { 1949 | if (!(name.substring(0, i).equals(pattern.substring(0, i)))) { 1950 | return false; 1951 | } 1952 | if (pattern.length() == i + 1) { 1953 | return true; // pattern would be '*' 1954 | } 1955 | return (matches(reverse(name.substring(i)), reverse(pattern.substring(i + 1)) + "*")); 1956 | } 1957 | return name.equals(pattern); 1958 | } catch (final java.lang.StringIndexOutOfBoundsException e) { 1959 | // this is normal. it's a lazy implementation 1960 | return false; 1961 | } 1962 | } 1963 | 1964 | private String reverse(final String s) { 1965 | if (s.length() < 2) { 1966 | return s; 1967 | } 1968 | return reverse(s.substring(1)) + s.charAt(0); 1969 | } 1970 | 1971 | // protocoll socket commands 1972 | 1973 | private void send(final String buf) throws IOException { 1974 | if (this.clientOutput == null) return; 1975 | final byte[] b = buf.getBytes(StandardCharsets.UTF_8); 1976 | this.clientOutput.write(b, 0, b.length); 1977 | this.clientOutput.write('\r'); 1978 | this.clientOutput.write('\n'); 1979 | this.clientOutput.flush(); 1980 | if (buf.startsWith("PASS")) { 1981 | Logger.info("> PASS ********"); 1982 | } else { 1983 | Logger.info("> " + buf); 1984 | } 1985 | } 1986 | 1987 | private String receive() throws IOException { 1988 | // last reply starts with 3 digit number followed by space 1989 | String reply; 1990 | 1991 | while (true) { 1992 | if (this.clientInput == null) { 1993 | throw new IOException("Server has presumably shut down the connection."); 1994 | } 1995 | reply = this.clientInput.readLine(); 1996 | 1997 | // sanity check 1998 | if (reply == null) { 1999 | throw new IOException("Server has presumably shut down the connection."); 2000 | } 2001 | 2002 | Logger.info("< " + reply); 2003 | // serverResponse.addElement(reply); 2004 | 2005 | if (reply.length() >= 4 && Character.isDigit(reply.charAt(0)) && Character.isDigit(reply.charAt(1)) 2006 | && Character.isDigit(reply.charAt(2)) && (reply.charAt(3) == ' ')) { 2007 | break; // end of reply 2008 | } 2009 | } 2010 | // return last reply line 2011 | return reply; 2012 | } 2013 | 2014 | private void sendTransferType(final char type) throws IOException { 2015 | send("TYPE " + type); 2016 | 2017 | final String reply = receive(); 2018 | if (isNotPositiveCompletion(reply)) { 2019 | throw new IOException(reply); 2020 | } 2021 | } 2022 | 2023 | /** 2024 | * @return 2025 | * @throws IOException 2026 | */ 2027 | private Socket getDataSocket() throws IOException { 2028 | Socket data; 2029 | if (isPassive()) { 2030 | if (this.DataSocketPassive == null) { 2031 | createDataSocket(); 2032 | } 2033 | data = this.DataSocketPassive; 2034 | } else { 2035 | if (this.DataSocketActive == null) { 2036 | createDataSocket(); 2037 | } 2038 | data = this.DataSocketActive.accept(); 2039 | } 2040 | return data; 2041 | } 2042 | 2043 | /** 2044 | * create data channel 2045 | * 2046 | * @throws IOException 2047 | */ 2048 | private void createDataSocket() throws IOException { 2049 | if (isPassive()) { 2050 | try { 2051 | createPassiveDataPort(); 2052 | } catch (final IOException e) { 2053 | createActiveDataPort(); 2054 | } 2055 | } else { 2056 | try { 2057 | createActiveDataPort(); 2058 | } catch (final IOException e) { 2059 | createPassiveDataPort(); 2060 | } 2061 | } 2062 | } 2063 | 2064 | /** 2065 | * use passive ftp? 2066 | * 2067 | * @return 2068 | */ 2069 | private boolean isPassive() { 2070 | return this.DataSocketPassiveMode; 2071 | } 2072 | 2073 | private void createActiveDataPort() throws IOException { 2074 | // create data socket and bind it to free port available 2075 | this.DataSocketActive = new ServerSocket(0); 2076 | this.DataSocketActive.setSoTimeout(getTimeout()); 2077 | this.DataSocketActive.setReceiveBufferSize(1440); // read http://www.cisco.com/warp/public/105/38.shtml 2078 | applyDataSocketTimeout(); 2079 | 2080 | // get port socket has been bound to 2081 | final int DataPort = this.DataSocketActive.getLocalPort(); 2082 | 2083 | // client ip 2084 | // InetAddress LocalIp = serverCore.publicIP(); 2085 | // InetAddress LocalIp = 2086 | // DataSocketActive.getInetAddress().getLocalHost(); 2087 | 2088 | // save ip address in high byte order 2089 | // byte[] Bytes = LocalIp.getAddress(); 2090 | final byte[] b = Domains.myPublicIPv4().iterator().next().getAddress(); 2091 | 2092 | // bytes greater than 127 should not be printed as negative 2093 | final short[] s = new short[4]; 2094 | for (int i = 0; i < 4; i++) { 2095 | s[i] = b[i]; 2096 | if (s[i] < 0) { 2097 | s[i] += 256; 2098 | } 2099 | } 2100 | 2101 | // send port command via control socket: 2102 | // four ip address shorts encoded and two port shorts encoded 2103 | send("PORT " 2104 | + 2105 | // "127,0,0,1," + 2106 | s[0] + "," + s[1] + "," + s[2] + "," + s[3] + "," + ((DataPort & 0xff00) >> 8) 2107 | + "," + (DataPort & 0x00ff)); 2108 | 2109 | // read status of the command from the control port 2110 | final String reply = receive(); 2111 | 2112 | // check status code 2113 | if (isNotPositiveCompletion(reply)) { 2114 | throw new IOException(reply); 2115 | } 2116 | 2117 | this.DataSocketPassiveMode = false; 2118 | } 2119 | 2120 | private void createPassiveDataPort() throws IOException { 2121 | // send port command via control socket: 2122 | // four ip address shorts encoded and two port shorts encoded 2123 | send("PASV"); 2124 | 2125 | // read status of the command from the control port 2126 | String reply = receive(); 2127 | 2128 | // check status code 2129 | if (getStatusCode(reply) != 227) { 2130 | throw new IOException(reply); 2131 | } 2132 | 2133 | // parse the status return: address should start at the first number 2134 | int pos = 4; 2135 | while ((pos < reply.length()) && ((reply.charAt(pos) < '0') || (reply.charAt(pos) > '9'))) { 2136 | pos++; 2137 | } 2138 | if (pos >= reply.length()) { 2139 | throw new IOException(reply + " [could not parse return code]"); 2140 | } 2141 | reply = reply.substring(pos); 2142 | pos = reply.length() - 1; 2143 | while ((pos >= 0) && ((reply.charAt(pos) < '0') || (reply.charAt(pos) > '9'))) { 2144 | pos--; 2145 | } 2146 | if (pos < 0) { 2147 | throw new IOException("[could not parse return code: no numbers]"); 2148 | } 2149 | reply = reply.substring(0, pos + 1); 2150 | final StringTokenizer st = new StringTokenizer(reply, ","); 2151 | if (st.countTokens() != 6) { 2152 | throw new IOException("[could not parse return code: wrong number of numbers]"); 2153 | } 2154 | 2155 | // set the data host and port 2156 | final int a = Integer.parseInt(st.nextToken()); 2157 | final int b = Integer.parseInt(st.nextToken()); 2158 | final int c = Integer.parseInt(st.nextToken()); 2159 | final int d = Integer.parseInt(st.nextToken()); 2160 | final InetAddress datahost = Domains.dnsResolve(a + "." + b + "." + c + "." + d); 2161 | final int high = Integer.parseInt(st.nextToken()); 2162 | final int low = Integer.parseInt(st.nextToken()); 2163 | if (high < 0 || high > 255 || low < 0 || low > 255) { 2164 | throw new IOException("[could not parse return code: syntax error]"); 2165 | } 2166 | final int dataport = (high << 8) + low; 2167 | 2168 | this.DataSocketPassive = new Socket(datahost, dataport); 2169 | applyDataSocketTimeout(); 2170 | this.DataSocketPassiveMode = true; 2171 | } 2172 | 2173 | /** 2174 | * closes data connection 2175 | * 2176 | * @throws IOException 2177 | */ 2178 | private void closeDataSocket() throws IOException { 2179 | if (isPassive()) { 2180 | if (this.DataSocketPassive != null) { 2181 | this.DataSocketPassive.close(); 2182 | this.DataSocketPassive = null; 2183 | } 2184 | } else { 2185 | if (this.DataSocketActive != null) { 2186 | this.DataSocketActive.close(); 2187 | this.DataSocketActive = null; 2188 | } 2189 | } 2190 | } 2191 | 2192 | /** 2193 | * sets the timeout for the socket 2194 | * 2195 | * @throws SocketException 2196 | */ 2197 | private void applyDataSocketTimeout() throws SocketException { 2198 | if (isPassive()) { 2199 | if (this.DataSocketPassive != null) { 2200 | this.DataSocketPassive.setSoTimeout(this.DataSocketTimeout * 1000); 2201 | } 2202 | } else { 2203 | if (this.DataSocketActive != null) { 2204 | this.DataSocketActive.setSoTimeout(this.DataSocketTimeout * 1000); 2205 | } 2206 | } 2207 | } 2208 | 2209 | private void get(final String fileDest, final String fileName) throws IOException { 2210 | // store time for statistics 2211 | final long start = System.currentTimeMillis(); 2212 | 2213 | createDataSocket(); 2214 | 2215 | // set type of the transfer 2216 | sendTransferType(transferType); 2217 | 2218 | // send command to the control port 2219 | send("RETR " + fileName); 2220 | 2221 | // read status of the command from the control port 2222 | final String reply = receive(); 2223 | 2224 | // get status code 2225 | final int status = getStatus(reply); 2226 | 2227 | // starting data transaction 2228 | if (status == 1) { 2229 | Socket data = null; 2230 | InputStream ClientStream = null; 2231 | RandomAccessFile outFile = null; 2232 | int length = 0; 2233 | try { 2234 | data = getDataSocket(); 2235 | ClientStream = data.getInputStream(); 2236 | 2237 | // create local file 2238 | if (fileDest == null) { 2239 | outFile = new RandomAccessFile(fileName, "rw"); 2240 | } else { 2241 | outFile = new RandomAccessFile(fileDest, "rw"); 2242 | } 2243 | 2244 | // write remote file to local file 2245 | final byte[] block = new byte[blockSize]; 2246 | int numRead; 2247 | 2248 | while ((numRead = ClientStream.read(block)) != -1) { 2249 | outFile.write(block, 0, numRead); 2250 | length = length + numRead; 2251 | } 2252 | } finally { 2253 | // shutdown connection 2254 | if(outFile != null) { 2255 | outFile.close(); 2256 | } 2257 | if(ClientStream != null) { 2258 | ClientStream.close(); 2259 | } 2260 | closeDataSocket(); 2261 | } 2262 | 2263 | // after stream is empty we should get control completion echo 2264 | /*reply =*/ receive(); 2265 | // boolean success = !isNotPositiveCompletion(reply); 2266 | // if (!success) throw new IOException(reply); 2267 | 2268 | // write statistics 2269 | final long stop = System.currentTimeMillis(); 2270 | Logger.info(" ---- downloaded " 2271 | + ((length < 2048) ? length + " bytes" : (length / 1024) + " kbytes") 2272 | + " in " 2273 | + (((stop - start) < 2000) ? (stop - start) + " milliseconds" 2274 | : (((int) ((stop - start) / 100)) / 10) + " seconds")); 2275 | if (start == stop) { 2276 | Logger.warn("start == stop"); 2277 | } else { 2278 | Logger.info(" (" + (length * 1000 / 1024 / (stop - start)) + " kbytes/second)"); 2279 | } 2280 | 2281 | } else { 2282 | throw new IOException(reply); 2283 | } 2284 | } 2285 | 2286 | 2287 | public byte[] get(final String fileName) throws IOException { 2288 | 2289 | createDataSocket(); 2290 | 2291 | // set type of the transfer 2292 | sendTransferType(transferType); 2293 | 2294 | // send command to the control port 2295 | send("RETR " + fileName); 2296 | 2297 | // read status of the command from the control port 2298 | final String reply = receive(); 2299 | 2300 | // get status code 2301 | final int status = getStatus(reply); 2302 | 2303 | // starting data transaction 2304 | if (status == 1) { 2305 | Socket data = null; 2306 | InputStream ClientStream = null; 2307 | final ByteArrayOutputStream os = new ByteArrayOutputStream(); 2308 | int length = 0; 2309 | try { 2310 | data = getDataSocket(); 2311 | ClientStream = data.getInputStream(); 2312 | 2313 | // write remote file to local file 2314 | final byte[] block = new byte[blockSize]; 2315 | int numRead; 2316 | 2317 | while ((numRead = ClientStream.read(block)) != -1) { 2318 | os.write(block, 0, numRead); 2319 | length = length + numRead; 2320 | } 2321 | } finally { 2322 | // shutdown connection 2323 | if (ClientStream != null) { 2324 | ClientStream.close(); 2325 | } 2326 | closeDataSocket(); 2327 | } 2328 | 2329 | // after stream is empty we should get control completion echo 2330 | /*reply =*/ receive(); 2331 | // boolean success = !isNotPositiveCompletion(reply); 2332 | return os.toByteArray(); 2333 | } 2334 | throw new IOException(reply); 2335 | } 2336 | 2337 | 2338 | private void put(final String fileName, final String fileDest) throws IOException { 2339 | 2340 | createDataSocket(); 2341 | 2342 | // set type of the transfer 2343 | sendTransferType(transferType); 2344 | 2345 | // send command to the control port 2346 | if (fileDest == null) { 2347 | send("STOR " + fileName); 2348 | } else { 2349 | send("STOR " + fileDest); 2350 | } 2351 | 2352 | // read status of the command from the control port 2353 | String reply = receive(); 2354 | 2355 | // starting data transaction 2356 | if (getStatus(reply) == 1) { 2357 | final Socket data = getDataSocket(); 2358 | final OutputStream ClientStream = data.getOutputStream(); 2359 | 2360 | // read from local file 2361 | final RandomAccessFile inFile = new RandomAccessFile(fileName, "r"); 2362 | 2363 | // write remote file to local file 2364 | final byte[] block = new byte[blockSize]; 2365 | int numRead; 2366 | 2367 | while ((numRead = inFile.read(block)) >= 0) { 2368 | ClientStream.write(block, 0, numRead); 2369 | } 2370 | 2371 | // shutdown and cleanup 2372 | inFile.close(); 2373 | ClientStream.close(); 2374 | 2375 | // shutdown remote client connection 2376 | data.close(); 2377 | 2378 | // after stream is empty we should get control completion echo 2379 | reply = receive(); 2380 | final boolean success = (getStatus(reply) == 2); 2381 | 2382 | if (!success) { 2383 | throw new IOException(reply); 2384 | } 2385 | 2386 | } else { 2387 | throw new IOException(reply); 2388 | } 2389 | } 2390 | 2391 | /** 2392 | * Login to server 2393 | * 2394 | * @param account 2395 | * @param password 2396 | * @throws IOException 2397 | */ 2398 | public void login(final String account, final String password) throws IOException { 2399 | unsetLoginData(); 2400 | 2401 | // send user name 2402 | send("USER " + account); 2403 | 2404 | String reply = receive(); 2405 | switch (getStatus(reply)) { 2406 | case 2: 2407 | // User logged in, proceed. 2408 | break; 2409 | case 5:// 530 Not logged in. 2410 | case 4: 2411 | case 1:// in RFC959 an error (page 57, diagram for the Login 2412 | // sequence) 2413 | throw new IOException(reply); 2414 | default: 2415 | // send password 2416 | send("PASS " + password); 2417 | 2418 | reply = receive(); 2419 | if (isNotPositiveCompletion(reply)) { 2420 | throw new IOException(reply); 2421 | } 2422 | } 2423 | setLoginData(account, password, reply); 2424 | } 2425 | 2426 | /** 2427 | * we are authorized to use the server 2428 | * 2429 | * @return 2430 | */ 2431 | public boolean isLoggedIn() { 2432 | return (this.account != null && this.password != null && this.remotegreeting != null); 2433 | } 2434 | 2435 | /** 2436 | * remember username and password which were used to login 2437 | * 2438 | * @param account 2439 | * @param password 2440 | * @param reply 2441 | * remoteGreeting 2442 | */ 2443 | private void setLoginData(final String account, final String password, final String reply) { 2444 | this.account = account; 2445 | this.password = password; 2446 | this.remotegreeting = reply; 2447 | } 2448 | 2449 | private void unsetLoginData() { 2450 | this.account = null; 2451 | this.password = null; 2452 | this.remotegreeting = null; 2453 | } 2454 | 2455 | public void sys() throws IOException { 2456 | // send system command 2457 | send("SYST"); 2458 | 2459 | // check completion 2460 | final String systemType = receive(); 2461 | if (isNotPositiveCompletion(systemType)) { 2462 | throw new IOException(systemType); 2463 | } 2464 | 2465 | // exclude status code from reply 2466 | this.remotesystem = systemType.substring(4); 2467 | } 2468 | 2469 | private void literal(final String commandLine) throws IOException { 2470 | // send the complete line 2471 | send(commandLine); 2472 | 2473 | // read reply 2474 | final String reply = receive(); 2475 | 2476 | if (getStatus(reply) == 5) { 2477 | throw new IOException(reply); 2478 | } 2479 | } 2480 | 2481 | /** 2482 | * control socket timeout 2483 | * 2484 | * @return 2485 | */ 2486 | public int getTimeout() { 2487 | return ControlSocketTimeout; 2488 | } 2489 | 2490 | /** 2491 | * after this time the data connection is closed 2492 | * 2493 | * @param timeout 2494 | * in seconds, 0 = infinite 2495 | */ 2496 | public void setDataSocketTimeout(final int timeout) { 2497 | this.DataSocketTimeout = timeout; 2498 | 2499 | try { 2500 | applyDataSocketTimeout(); 2501 | } catch (final SocketException e) { 2502 | Logger.warn("setDataSocketTimeout: " + e.getMessage()); 2503 | } 2504 | } 2505 | 2506 | public static List dir(final String host, final String remotePath, final String account, 2507 | final String password, final boolean extended) { 2508 | try { 2509 | final FTPClient c = new FTPClient(); 2510 | c.cmd = new String[] { "open", host }; 2511 | c.OPEN(); 2512 | c.cmd = new String[] { "user", account, password }; 2513 | c.USER(); 2514 | c.cmd = new String[] { "ls" }; 2515 | final List v = c.list(remotePath, extended); 2516 | c.cmd = new String[] { "close" }; 2517 | c.CLOSE(); 2518 | c.cmd = new String[] { "exit" }; 2519 | c.EXIT(); 2520 | return v; 2521 | } catch (final RuntimeException e) { 2522 | return null; 2523 | } catch (final IOException e) { 2524 | return null; 2525 | } 2526 | } 2527 | 2528 | private static void dir(final String host, final String remotePath, final String account, final String password) { 2529 | try { 2530 | final FTPClient c = new FTPClient(); 2531 | c.exec("open " + host, false); 2532 | c.exec("user " + account + " " + password, false); 2533 | c.exec("cd " + remotePath, false); 2534 | c.exec("ls", true); 2535 | c.exec("close", false); 2536 | c.exec("exit", false); 2537 | } catch (final RuntimeException e) { 2538 | } 2539 | } 2540 | 2541 | /** 2542 | * Asynchronously generate a list of all files on a ftp server using the anonymous account. 2543 | * @param host host name or address 2544 | * @param port ftp port 2545 | * @param user user name 2546 | * @param pw user password 2547 | * @param path path on the ftp site 2548 | * @param depth the maximum depth of the sub folders exploration. 2549 | * @return a queue asynchronously filled with entryInfo from all files of the ftp server 2550 | * @throws IOException when a error occurred 2551 | */ 2552 | public static BlockingQueue sitelist(final String host, final int port, final String user, final String pw, final String path, final int depth) throws IOException { 2553 | final FTPClient ftpClient = new FTPClient(); 2554 | ftpClient.open(host, port); 2555 | ftpClient.login(user, pw); 2556 | final LinkedBlockingQueue queue = new LinkedBlockingQueue(); 2557 | new Thread() { 2558 | @Override 2559 | public void run() { 2560 | try { 2561 | Thread.currentThread().setName("FTP.sitelist(" + host + ":" + port + ")"); 2562 | sitelist(ftpClient, path, queue, depth); 2563 | ftpClient.quit(); 2564 | } catch (final Exception e) {} finally { 2565 | queue.add(POISON_entryInfo); 2566 | } 2567 | } 2568 | }.start(); 2569 | return queue; 2570 | } 2571 | 2572 | /** 2573 | * Feed the queue with files under a given path on a ftp server using 2574 | * the anonymous account. When path is a file path, only one entry is added 2575 | * to the queue. 2576 | * 2577 | * @param ftpClient 2578 | * fptClient initialized with a host and login information 2579 | * @param path 2580 | * path on the host 2581 | * @param queue 2582 | * the entries queue to feed 2583 | * @param depth 2584 | * the maximum depth of the sub folders exploration. 2585 | * @throws IOException 2586 | * when a error occurred 2587 | */ 2588 | private static void sitelist(final FTPClient ftpClient, String path, final LinkedBlockingQueue queue, final int depth) { 2589 | List list; 2590 | try { 2591 | list = ftpClient.list(path, true); 2592 | } catch (final IOException e) { 2593 | /* path might be a file path */ 2594 | if (!path.endsWith("/")) { 2595 | entryInfo info = ftpClient.fileInfo(path); 2596 | if (info != null) { 2597 | queue.add(info); 2598 | } else { 2599 | /* We could not get file information, but this doesn't mean the file does not exist : 2600 | * we add it anyway to the queue */ 2601 | info = new entryInfo(); 2602 | info.name = path; 2603 | queue.add(info); 2604 | } 2605 | } else { 2606 | Logger.warn("cannot make sitelist", e); 2607 | } 2608 | return; 2609 | } 2610 | if (!path.endsWith("/")) path += "/"; 2611 | entryInfo info; 2612 | // first find all files and add them to the crawl list 2613 | for (final String line : list) { 2614 | info = parseListData(line); 2615 | if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) { 2616 | if (!info.name.startsWith("/")) info.name = path + info.name; 2617 | queue.add(info); 2618 | } 2619 | } 2620 | // then find all directories and add them recursively if depth is over zero 2621 | if(depth > 0) { 2622 | for (final String line : list) { 2623 | //System.out.println("LIST:" + line); 2624 | info = parseListData(line); 2625 | if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) { 2626 | if (info.type == filetype.directory) { 2627 | sitelist(ftpClient, path + info.name, queue, depth - 1); 2628 | } else if (info.type == filetype.link) { 2629 | final int q = info.name.indexOf("->",0); 2630 | if (q >= 0 && info.name.indexOf("..", q) < 0) { 2631 | //System.out.println("*** LINK:" + line); 2632 | info.name = info.name.substring(0, q).trim(); 2633 | sitelist(ftpClient, path + info.name, queue, depth - 1); 2634 | } 2635 | 2636 | } 2637 | } 2638 | } 2639 | } 2640 | } 2641 | 2642 | public StringBuilder dirhtml(String remotePath) throws IOException { 2643 | // returns a directory listing using an existing connection 2644 | if (isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) { 2645 | remotePath += '/'; 2646 | } 2647 | final String pwd = pwd(); 2648 | final List list = list(remotePath, true); 2649 | if (this.remotesystem == null) try {sys();} catch (final IOException e) {} 2650 | final String base = "ftp://" + ((this.account.equals(ANONYMOUS)) ? "" : (this.account + ":" + this.password + "@")) 2651 | + this.host + ((this.port == 21) ? "" : (":" + this.port)) + ((remotePath.length() > 0 && remotePath.charAt(0) == '/') ? "" : pwd + "/") 2652 | + remotePath; 2653 | 2654 | return dirhtml(base, this.remotemessage, this.remotegreeting, this.remotesystem, list, true); 2655 | } 2656 | 2657 | private static StringBuilder dirhtml( 2658 | final String host, final int port, final String remotePath, 2659 | final String account, final String password) throws IOException { 2660 | // opens a new connection and returns a directory listing as html 2661 | final FTPClient c = new FTPClient(); 2662 | c.open(host, port); 2663 | c.login(account, password); 2664 | c.sys(); 2665 | final StringBuilder page = c.dirhtml(remotePath); 2666 | c.quit(); 2667 | return page; 2668 | } 2669 | 2670 | public static StringBuilder dirhtml( 2671 | final String base, final String servermessage, final String greeting, 2672 | final String system, final List list, 2673 | final boolean metaRobotNoindex) { 2674 | // this creates the html output from collected strings 2675 | final StringBuilder page = new StringBuilder(1024); 2676 | final String title = "Index of " + base; 2677 | 2678 | page.append("\n"); 2679 | page.append("\n"); 2680 | page.append(" ").append(title).append("\n"); 2681 | page.append(" \n"); 2682 | if (metaRobotNoindex) { 2683 | page.append(" \n"); 2684 | } 2685 | page.append(" \n"); 2686 | page.append("\n"); 2687 | page.append("

").append(title).append("

\n"); 2688 | if (servermessage != null && greeting != null) { 2689 | page.append("

Server \"").append(servermessage).append("\" responded:\n");
2690 |             page.append("  \n");
2691 |             page.append(greeting);
2692 |             page.append("\n");
2693 |             page.append("  

\n"); 2694 | } 2695 | page.append("
\n"); 2696 | page.append("
\n");
2697 |         int nameStart, nameEnd;
2698 |         entryInfo info;
2699 |         for (final String line : list) {
2700 |             info = parseListData(line);
2701 |             if (info != null) {
2702 |                 // with link
2703 |                 nameStart = line.indexOf(info.name);
2704 |                 page.append(line.substring(0, nameStart));
2705 |                 page.append("").append(info.name).append("");
2706 |                 nameEnd = nameStart + info.name.length();
2707 |                 if (line.length() > nameEnd) {
2708 |                     page.append(line.substring(nameEnd));
2709 |                 }
2710 |             } else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://") || line.startsWith("file://")) {
2711 |                 page.append("").append(line).append("");
2712 |             } else {
2713 |                // raw
2714 |                page.append(line);
2715 |             }
2716 |             page.append('\n');
2717 |         }
2718 |         page.append("  
\n"); 2719 | page.append("
\n"); 2720 | if (system != null) page.append("
System info: \"").append(system).append("\"
\n"); 2721 | page.append("\n"); 2722 | 2723 | return page; 2724 | } 2725 | 2726 | public static String put(final String host, File localFile, String remotePath, final String remoteName, 2727 | final String account, final String password) throws IOException { 2728 | // returns the log 2729 | try { 2730 | final ByteArrayOutputStream bout = new ByteArrayOutputStream(); 2731 | final PrintStream out = new PrintStream(bout); 2732 | 2733 | final ByteArrayOutputStream berr = new ByteArrayOutputStream(); 2734 | final PrintStream err = new PrintStream(berr); 2735 | 2736 | final FTPClient c = new FTPClient(); 2737 | c.exec("open " + host, false); 2738 | c.exec("user " + account + " " + password, false); 2739 | if (remotePath != null) { 2740 | remotePath = remotePath.replace('\\', '/'); 2741 | c.exec("cd " + remotePath, false); 2742 | } 2743 | c.exec("binary", false); 2744 | if (localFile.isAbsolute()) { 2745 | c.exec("lcd \"" + localFile.getParent() + "\"", false); 2746 | localFile = new File(localFile.getName()); 2747 | } 2748 | c.exec("put " + localFile.toString() + ((remoteName.isEmpty()) ? "" : (" " + remoteName)), false); 2749 | c.exec("close", false); 2750 | c.exec("exit", false); 2751 | 2752 | out.close(); 2753 | err.close(); 2754 | 2755 | final String outLog = bout.toString(); 2756 | bout.close(); 2757 | 2758 | final String errLog = berr.toString(); 2759 | berr.close(); 2760 | 2761 | if (errLog.length() > 0) { 2762 | throw new IOException("Ftp put failed:\n" + errLog); 2763 | } 2764 | 2765 | return outLog; 2766 | } catch (final IOException e) { 2767 | throw e; 2768 | } 2769 | } 2770 | 2771 | public static void get(final String host, String remoteFile, final File localPath, final String account, final String password) { 2772 | try { 2773 | final FTPClient c = new FTPClient(); 2774 | if (remoteFile.isEmpty()) { 2775 | remoteFile = "/"; 2776 | } 2777 | c.exec("open " + host, false); 2778 | c.exec("user " + account + " " + password, false); 2779 | c.exec("lcd " + localPath.getAbsolutePath(), false); 2780 | c.exec("binary", false); 2781 | c.exec("get " + remoteFile + " " + localPath.getAbsoluteFile().toString(), false); 2782 | c.exec("close", false); 2783 | c.exec("exit", false); 2784 | } catch (final RuntimeException e) { 2785 | } 2786 | } 2787 | 2788 | public static void getAnonymous(final String host, final String remoteFile, final File localPath) { 2789 | get(host, remoteFile, localPath, ANONYMOUS, "anomic"); 2790 | } 2791 | 2792 | /** 2793 | * class that puts a file on a ftp-server can be used as a thread 2794 | */ 2795 | static class pt implements Runnable { 2796 | String host; 2797 | File localFile; 2798 | String remotePath; 2799 | String remoteName; 2800 | String account; 2801 | String password; 2802 | 2803 | public pt(final String h, final File l, final String rp, final String rn, final String a, final String p) { 2804 | this.host = h; 2805 | this.localFile = l; 2806 | this.remotePath = rp; 2807 | this.remoteName = rn; 2808 | this.account = a; 2809 | this.password = p; 2810 | } 2811 | 2812 | @Override 2813 | public final void run() { 2814 | try { 2815 | Thread.currentThread().setName("FTP.pt(" + this.host + ")"); 2816 | put(this.host, this.localFile, this.remotePath, this.remoteName, this.account, this.password); 2817 | } catch (final IOException e) { 2818 | Logger.warn(e.getMessage(), e); 2819 | } 2820 | } 2821 | } 2822 | 2823 | public static Thread putAsync(final String host, final File localFile, final String remotePath, 2824 | final String remoteName, final String account, final String password) { 2825 | final Thread t = new Thread(new pt(host, localFile, remotePath, remoteName, account, password), "ftp to " + host); 2826 | t.start(); 2827 | return t; // return value can be used to determine status of transfer 2828 | // with isAlive() or join() 2829 | } 2830 | 2831 | private static void printHelp() { 2832 | System.out.println("FTPClient help"); 2833 | System.out.println("----------"); 2834 | System.out.println(); 2835 | System.out.println("The following commands are supported"); 2836 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -h -- prints this help"); 2837 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -dir [':'] [ ]"); 2838 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -htmldir "); 2839 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -get [':'] [ ]"); 2840 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -put [':'] "); 2841 | System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -sitelist "); 2842 | System.out.println(); 2843 | } 2844 | 2845 | public static void main(final String[] args) { 2846 | try { 2847 | System.out.println("WELCOME TO THE ANOMIC FTP CLIENT v" + vDATE); 2848 | System.out.println("Visit http://www.anomic.de and support shareware!"); 2849 | System.out.println("try -h for command line options"); 2850 | System.out.println(); 2851 | if (args.length == 1) { 2852 | if (args[0].equals("-h")) { 2853 | printHelp(); 2854 | } 2855 | } else if (args.length == 2) { 2856 | printHelp(); 2857 | } else if (args.length == 3) { 2858 | if (args[0].equals("-dir")) { 2859 | dir(args[1], args[2], ANONYMOUS, "anomic@"); 2860 | } else if (args[0].equals("-htmldir")) { 2861 | final File file = new File("dirindex.html"); 2862 | try (FileOutputStream fos = new FileOutputStream(file);) { 2863 | final StringBuilder page = dirhtml(args[1], 21, args[2], ANONYMOUS, "anomic@"); 2864 | fos.write(page.toString().getBytes(StandardCharsets.UTF_8)); 2865 | } catch (final FileNotFoundException e) { 2866 | Logger.warn("", e); 2867 | } catch (final IOException e) { 2868 | Logger.warn("", e); 2869 | } 2870 | } else { 2871 | printHelp(); 2872 | } 2873 | } else if (args.length == 4) { 2874 | if (args[0].equals("-get")) { 2875 | getAnonymous(args[1], args[2], new File(args[3])); 2876 | } else if (args[0].equals("-sitelist")) { 2877 | try { 2878 | final BlockingQueue q = sitelist(args[1], Integer.parseInt(args[2]), ANONYMOUS, "anomic", "/", Integer.parseInt(args[3])); 2879 | entryInfo entry; 2880 | while ((entry = q.take()) != FTPClient.POISON_entryInfo) { 2881 | System.out.println(entry.toString()); 2882 | } 2883 | } catch (final FileNotFoundException e) { 2884 | Logger.warn("", e); 2885 | } catch (final IOException e) { 2886 | Logger.warn("", e); 2887 | } catch (final InterruptedException e) { 2888 | Logger.warn("", e); 2889 | } 2890 | } else { 2891 | printHelp(); 2892 | } 2893 | } else if (args.length == 5) { 2894 | if (args[0].equals("-dir")) { 2895 | dir(args[1], args[2], args[3], args[4]); 2896 | } else { 2897 | printHelp(); 2898 | } 2899 | } else if (args.length == 6) { 2900 | if (args[0].equals("-get")) { 2901 | get(args[1], args[2], new File(args[3]), args[4], args[5]); 2902 | } else if (args[0].equals("-put")) { 2903 | try { 2904 | put(args[1], new File(args[2]), args[3], "", args[4], args[5]); 2905 | } catch (final IOException e) { 2906 | Logger.warn(e.getMessage(), e); 2907 | } 2908 | } else { 2909 | printHelp(); 2910 | } 2911 | } else { 2912 | printHelp(); 2913 | } 2914 | } catch (final Exception e) { 2915 | 2916 | } 2917 | } 2918 | } 2919 | --------------------------------------------------------------------------------