├── .gitignore
├── README.md
├── config
│   ├── ivy.xml
│   ├── nutch-site.xml
│   └── regex-urlfilter.txt
├── docker-compose.yml
└── nutch
    ├── Dockerfile
    └── startup.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/crawldata
/data
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Apache Nutch, Elasticsearch, MongoDB

This repo contains 1) a Dockerfile build for Apache Nutch and 2) a docker-compose setup for use with Elasticsearch and MongoDB.

Info: MongoDB is currently not attached or used.

## Apache Nutch Docker Build

The [Dockerfile](./nutch/Dockerfile) provides a Docker build of Apache Nutch, published as [smartive/nutch](https://hub.docker.com/r/smartive/nutch/).
There are two published builds:
- `latest` contains [Apache Nutch v1.13](https://github.com/apache/nutch/tree/release-1.13) for Elasticsearch 2.3.*
- `es-5` contains a [modified version of Apache Nutch v1.13](https://github.com/smartive/nutch/tree/feature/es-5) ready for Elasticsearch 5.4.*

## Apache Nutch docker-compose Setup for Elasticsearch 2.3.* / 5.4.* and MongoDB

[This repo](https://github.com/smartive/docker-nutch-elasticsearch-mongodb) contains a [docker-compose](https://github.com/smartive/docker-nutch-elasticsearch-mongodb/blob/master/docker-compose.yml) configuration for Apache Nutch with Elasticsearch 2.3.* / 5.4.* and MongoDB.

To get started, check out the [repo](https://github.com/smartive/docker-nutch-elasticsearch-mongodb) and run:

```bash
git clone git@github.com:smartive/docker-nutch-elasticsearch-mongodb.git
cd ./docker-nutch-elasticsearch-mongodb && docker-compose up
```

This fires up the nutchserver (REST API) and the webapp. Visit [http://localhost:8080/](http://localhost:8080/).

### Manual Run

```bash
docker-compose run -p 8080:8080 -p 8081:8081 --name=manual_nutch --rm --entrypoint=bash nutch
```

Then, inside the container, create the seed file:

```bash
echo "https://smartive.ch/" > seed.txt
```

Then open `regex-urlfilter.txt` and replace the last line to limit the crawl to the domain `smartive.ch`:

```bash
vi nutch/conf/regex-urlfilter.txt
# Inside regex-urlfilter.txt replace the last line `+.` with:
+^https://smartive\.ch
```

Then start the crawl (`-i` indexes into the configured indexer, `-s` points at the seed file, `crawldata` is the crawl directory, and `2` is the number of crawl rounds):

```bash
nutch/bin/crawl -i -s seed.txt crawldata 2
```

To only build the Elasticsearch index from an existing crawl database (adjust the segment name to your own crawl):

```bash
/root/nutch/bin/nutch index crawldata/crawldb -linkdb crawldata/linkdb crawldata/segments/20170706210640
```
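Once documents are indexed, you can sanity-check the result from the host by querying Elasticsearch directly. A minimal sketch, assuming the defaults from [config/nutch-site.xml](./config/nutch-site.xml): the `indexer-elastic-rest` plugin writes to the `nutch_rest` index on port 9200, which docker-compose.yml publishes:

```bash
# Count the documents Nutch has indexed (index name from elastic.rest.index)
curl -s 'http://localhost:9200/nutch_rest/_count?pretty'

# Simple full-text query over the crawled pages
curl -s 'http://localhost:9200/nutch_rest/_search?q=content:smartive&pretty'
```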
# Credits

This Dockerfile and docker-compose setup is partly based on [tpickett/mongo-elasticsearch-nutch](https://github.com/tpickett/mongo-elasticsearch-nutch).

[Apache Nutch](http://nutch.apache.org/) is a highly extensible and scalable open source web crawler software project. It is a well-matured, production-ready crawler.
--------------------------------------------------------------------------------
/config/ivy.xml:
--------------------------------------------------------------------------------
<!-- Ivy dependency descriptor for the Nutch build. Module description:
     "Nutch is an open source web-search software. It builds on Hadoop, Tika
     and Solr, adding web-specifics, such as a crawler, a link-graph
     database etc."
     (dependency listing omitted) -->
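docker-compose.yml mounts this file over `/root/nutch/ivy/ivy.xml` in the container, but dependency changes only take effect after an Ant/Ivy rebuild. A rough sketch inside the container, assuming the source checkout at `/root/nutch_source` created by the [Dockerfile](./nutch/Dockerfile):

```bash
# The compose file mounts ivy.xml into the runtime directory; copy it into
# the source tree so the Ant/Ivy resolve picks it up, then rebuild
# runtime/local (which /root/nutch symlinks to)
cp /root/nutch/ivy/ivy.xml /root/nutch_source/ivy/ivy.xml
cd /root/nutch_source && ant runtime
```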
--------------------------------------------------------------------------------
/config/nutch-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>

  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.mongodb.store.MongoStore</value>
    <description>Default class for storing data</description>
  </property>

  <property>
    <name>http.agent.name</name>
    <value>Test Crawler</value>
  </property>

  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>

  <property>
    <name>plugin.includes</name>
    <value>protocol-selenium|urlfilter-regex|index-(basic|anchor|metadata)|query-(basic|site|url|lang)|indexer-elastic-rest|parse-(text|html|tika|metatags)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  </property>

  <!-- Tika parsing -->
  <property>
    <name>tika.uppercase.element.names</name>
    <value>true</value>
    <description>Determines whether TikaParser should uppercase the element name
    while generating the DOM for a page, as done by Neko (used per default by
    parse-html) (see NUTCH-1592).</description>
  </property>

  <property>
    <name>tika.extractor</name>
    <value>boilerpipe</value>
    <description>Which text extraction algorithm to use. Valid values are:
    boilerpipe or none.</description>
  </property>

  <property>
    <name>tika.extractor.boilerpipe.algorithm</name>
    <value>ArticleExtractor</value>
    <description>Which Boilerpipe algorithm to use. Valid values are:
    DefaultExtractor, ArticleExtractor or CanolaExtractor.</description>
  </property>

  <!-- Elasticsearch via TransportClient -->
  <property>
    <name>elastic.host</name>
    <value>elasticsearch</value>
    <description>Comma-separated list of hostnames to send documents to using
    TransportClient. Either host and port must be defined, or cluster.</description>
  </property>

  <property>
    <name>elastic.port</name>
    <value>9300</value>
    <description>The port to connect to using TransportClient.</description>
  </property>

  <property>
    <name>elastic.cluster</name>
    <value>elasticsearch</value>
    <description>The cluster name to discover. Either host and port must be
    defined, or cluster.</description>
  </property>

  <property>
    <name>elastic.index</name>
    <value>nutch_test</value>
    <description>Default index to send documents to.</description>
  </property>

  <property>
    <name>elastic.max.bulk.docs</name>
    <value>250</value>
    <description>Maximum size of the bulk in number of documents.</description>
  </property>

  <property>
    <name>elastic.max.bulk.size</name>
    <value>2500500</value>
    <description>Maximum size of the bulk in bytes.</description>
  </property>

  <property>
    <name>elastic.exponential.backoff.millis</name>
    <value>100</value>
    <description>Initial delay for the BulkProcessor's exponential backoff
    policy.</description>
  </property>

  <property>
    <name>elastic.exponential.backoff.retries</name>
    <value>10</value>
    <description>Number of times the BulkProcessor's exponential backoff policy
    should retry bulk operations.</description>
  </property>

  <property>
    <name>elastic.bulk.close.timeout</name>
    <value>600</value>
    <description>Number of seconds allowed for the BulkProcessor to complete its
    last operation.</description>
  </property>

  <!-- Elasticsearch via REST (indexer-elastic-rest, Jest) -->
  <property>
    <name>elastic.rest.host</name>
    <value>elasticsearch</value>
    <description>The hostname to send documents to using Elasticsearch Jest.
    Both host and port must be defined.</description>
  </property>

  <property>
    <name>elastic.rest.port</name>
    <value>9200</value>
    <description>The port to connect to using Elasticsearch Jest.</description>
  </property>

  <property>
    <name>elastic.rest.index</name>
    <value>nutch_rest</value>
    <description>Default index to send documents to.</description>
  </property>

  <property>
    <name>elastic.rest.type</name>
    <value>doc</value>
    <description>Default type to send documents to.</description>
  </property>

  <property>
    <name>elastic.rest.max.bulk.docs</name>
    <value>250</value>
    <description>Maximum size of the bulk in number of documents.</description>
  </property>

  <property>
    <name>elastic.rest.max.bulk.size</name>
    <value>26214400</value>
    <description>Maximum size of the bulk in bytes.</description>
  </property>

  <property>
    <name>elastic.rest.https</name>
    <value>false</value>
    <description>
    "true" to enable https, "false" to disable https.
    If you've disabled http access (by forcing https), be sure to
    set this to true, otherwise you might get "connection reset by peer".
    </description>
  </property>

  <property>
    <name>elastic.rest.user</name>
    <value></value>
    <description>Username for auth credentials (only used when https is
    enabled).</description>
  </property>

  <property>
    <name>elastic.rest.password</name>
    <value></value>
    <description>Password for auth credentials (only used when https is
    enabled).</description>
  </property>

  <property>
    <name>elastic.rest.trustallhostnames</name>
    <value>false</value>
    <description>
    "true" to trust the elasticsearch server's certificate even if its listed
    domain name does not match the domain it is hosted on;
    "false" to check whether the certificate's listed domain matches the domain
    the server is hosted on, and to fail indexing if it does not
    (only used when https is enabled).
    </description>
  </property>

  <!-- Metatag extraction and indexing -->
  <property>
    <name>metatags.names</name>
    <value>author,description,keywords,image</value>
    <description>Names of the metatags to extract, separated by ','.
    Use '*' to extract all metatags. Prefixes the names with 'metatag.'
    in the parse-metadata. For instance, to index description and keywords,
    you need to activate the plugin index-metadata and set the value of the
    parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.</description>
  </property>

  <property>
    <name>index.parse.md</name>
    <value>metatag.description,metatag.keywords,metatag.author,metatag.image</value>
    <description>
    Comma-separated list of keys to be taken from the parse metadata to generate
    fields. Can be used e.g. for 'description' or 'keywords', provided that these
    values are generated by a parser (see the parse-metatags plugin).
    </description>
  </property>

  <property>
    <name>index.content.md</name>
    <value></value>
    <description>
    Comma-separated list of keys to be taken from the content metadata to
    generate fields.
    </description>
  </property>

  <property>
    <name>index.db.md</name>
    <value></value>
    <description>
    Comma-separated list of keys to be taken from the crawldb metadata to
    generate fields. Can be used to index values propagated from the seeds
    with the plugin urlmeta.
    </description>
  </property>

  <!-- Selenium fetching -->
  <property>
    <name>selenium.driver</name>
    <value>phantomjs</value>
    <description>
    A String value representing the flavour of Selenium
    WebDriver() to use. Currently the following options
    exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomjs', and 'remote'.
    If 'remote' is used, it is essential to also set correct properties for
    'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
    'selenium.hub.protocol'.
    </description>
  </property>

  <property>
    <name>libselenium.page.load.delay</name>
    <value>10</value>
    <description>
    The delay in seconds to use when loading a page with lib-selenium. This
    setting is used by protocol-selenium and protocol-interactiveselenium
    since they depend on lib-selenium for fetching.
    </description>
  </property>

</configuration>
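With the stack running, the nutchserver exposes its configuration over REST, which is handy for checking which of these settings were actually picked up. A sketch against the REST API on its default port 8081 (published by docker-compose.yml), assuming the standard `/config` endpoints:

```bash
# List the configuration profiles known to the nutchserver
curl -s 'http://localhost:8081/config'

# Dump the effective settings of the default profile and filter for the
# Elasticsearch-related keys defined above
curl -s 'http://localhost:8081/config/default' | tr ',' '\n' | grep -i elastic
```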
--------------------------------------------------------------------------------
/config/regex-urlfilter.txt:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# The default url filter.
# Better for whole-internet crawling.

# Each non-comment, non-blank line contains a regular expression
# prefixed by '+' or '-'.  The first matching pattern in the file
# determines whether a URL is included or ignored.  If no pattern
# matches, the URL is ignored.

# skip file: ftp: and mailto: urls
-^(file|ftp|mailto):

# skip image and other suffixes we can't yet parse
# for a more extensive coverage use the urlfilter-suffix plugin
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG)$

# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]

# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/

# skip twitter
-^https://twitter\.com

# skip facebook
-^https://www\.facebook\.com

# accept anything else
+.
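Filter rules are easy to get wrong, so it pays to test them before crawling. Nutch's stock `bin/nutch filterchecker` tool runs URLs from stdin through the active URL filter chain; a sketch inside the container, where accepted URLs should come back prefixed with `+` and rejected ones with `-`:

```bash
# With the smartive.ch rule from the README in place, the first URL should
# be accepted (+) and the twitter URL rejected (-) by the rule above
printf 'https://smartive.ch/\nhttps://twitter.com/smartive\n' \
  | /root/nutch/bin/nutch filterchecker -allCombined
```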
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
nutch:
  image: smartive/nutch
  ports:
    - "8080:8080"
    - "8081:8081"
  links:
    - "mongodb:mongodb"
    - "elasticsearch:elasticsearch"
  volumes:
    - "./config/ivy.xml:/root/nutch/ivy/ivy.xml"
    - "./config/nutch-site.xml:/root/nutch/conf/nutch-site.xml"
    - "./config/regex-urlfilter.txt:/root/nutch/conf/regex-urlfilter.txt"
    - "./crawldata:/root/crawldata"
mongodb:
  image: mongo
  ports:
    - "27020:27017"
  volumes:
    - "./data/mongo:/data/db"
elasticsearch:
  image: elasticsearch:2.3.3
  ports:
    - "9200:9200"
    - "9300:9300"
--------------------------------------------------------------------------------
/nutch/Dockerfile:
--------------------------------------------------------------------------------
# Based on https://raw.githubusercontent.com/apache/nutch/master/docker/Dockerfile

FROM java:8
MAINTAINER smartive AG

ENV NUTCH_HOME /root/nutch
ENV PHANTOM_JS phantomjs-2.1.1-linux-x86_64

WORKDIR /root/

# Update and upgrade the base image packages
RUN apt-get update && \
    apt-get upgrade -y

# Add the repository that we'll pull java down from.
#RUN add-apt-repository -y ppa:webupd8team/java && apt-get update && apt-get upgrade -y

# Get Oracle Java 1.7 installed
#RUN echo oracle-java7-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && apt-get install -y oracle-java7-installer oracle-java7-set-default

# Install various dependencies
RUN apt-get install -y \
    ant \
    openssh-server \
    vim \
    telnet \
    git \
    rsync \
    curl \
    build-essential \
    chrpath \
    libssl-dev \
    libxft-dev \
    libfreetype6 \
    libfreetype6-dev \
    libfontconfig1 \
    libfontconfig1-dev

# Set up JAVA_HOME
#RUN echo 'export JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")' >> $HOME/.bashrc

# Install PhantomJS
RUN wget https://bitbucket.org/ariya/phantomjs/downloads/$PHANTOM_JS.tar.bz2 && \
    tar xvjf $PHANTOM_JS.tar.bz2 && \
    mv $PHANTOM_JS /usr/local/share && \
    ln -sf /usr/local/share/$PHANTOM_JS/bin/phantomjs /usr/local/bin

# Check out and build the nutch trunk
RUN wget https://github.com/apache/nutch/archive/master.zip && unzip master.zip && mv nutch-master nutch_source && cd nutch_source && ant

# Convenience symlink to Nutch runtime local
RUN ln -s nutch_source/runtime/local $NUTCH_HOME

ADD startup.sh /root/startup.sh
RUN chmod +x /root/startup.sh

ENTRYPOINT ["/root/startup.sh"]
--------------------------------------------------------------------------------
/nutch/startup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Start the nutchserver (REST API) in the background
$NUTCH_HOME/bin/nutch nutchserver > /dev/null &
# Start the nutch web gui in the foreground to keep the container alive
$NUTCH_HOME/bin/nutch webapp
--------------------------------------------------------------------------------
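Because `startup.sh` discards the nutchserver output, a quick probe of the published ports is the easiest way to confirm that both processes came up. A sketch from the host, assuming the default ports mapped in docker-compose.yml (8081 for the nutchserver REST API, 8080 for the web GUI):

```bash
# Nutch REST API; should answer with JSON status info
curl -s 'http://localhost:8081/admin'

# Web GUI; print just the HTTP status code (expect 200)
curl -s -o /dev/null -w '%{http_code}\n' 'http://localhost:8080/'
```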