├── .gitignore
├── LICENSE.txt
├── MANIFEST.IN
├── README.md
├── conda.recipe
    ├── bld.bat
    ├── build.sh
    └── meta.yaml
├── nutch
    ├── README.md
    ├── __init__.py
    ├── crawl.py
    ├── nutch.py
    └── test_nutch.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | /.project
 2 | /.pydevproject
 3 | /nutch.egg-info
 4 | /build
 5 | /dist
 6 | *.log
 7 | /.settings/
 8 | /nutch/*.pyc
 9 | /nutch/*/*.pyc
10 | .DS_Store
11 | /setup.cfg
12 | /.eggs/
13 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/MANIFEST.IN:
--------------------------------------------------------------------------------
1 | include LICENSE.txt
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | nutch-python
 2 | ===========
 3 | A Python client library for [Apache Nutch](http://nutch.apache.org/)
 4 | that makes Nutch 1.x capabilities available using the
 5 | [Nutch REST Server](https://wiki.apache.org/nutch/Nutch_1.X_RESTAPI).
 6 | 
 7 | See the [Nutch Tutorial](https://wiki.apache.org/nutch/NutchTutorial) for installing
 8 | Nutch 1.x and alternatively operating it via the command line.
 9 | 
10 | This Python client library for Nutch is installable via Setuptools,
11 | Pip and Easy Install.
12 | 
13 | Installation (with pip)
14 | -----------------------
15 | 1. `pip install nutch`
16 | 
17 | Installation (without pip)
18 | --------------------------
19 | 1. `python setup.py build`  
20 | 2. `python setup.py install`  
21 | 
22 | Wiki Documentation
23 | ==================
24 | See the [wiki](https://github.com/chrismattmann/nutch-python/wiki) for instructions on how to use Nutch-Python and
25 | its API.
26 | 
27 | 
28 | New Command Line Tool
29 | ============================
30 | When you install Nutch-Python you also get a new command
31 | line client tool, `nutch-python` installed in your /path/to/python/bin
32 | directory.
33 | 
34 | The options and help for the command line tool can be seen by typing
35 | `nutch-python` without any arguments.
36 | 
37 | Questions, comments?
38 | ===================
39 | Send them to [Chris A. Mattmann](mailto:chris.a.mattmann@jpl.nasa.gov).
40 | 
41 | Contributors
42 | ============
43 | * Brian D. Wilson, JPL
44 | * Chris A. Mattmann, JPL
45 | * Aron Ahmadia, Continuum Analytics
46 | 
47 | License
48 | =======
49 | [Apache License, version 2](http://www.apache.org/licenses/LICENSE-2.0)
50 | 


--------------------------------------------------------------------------------
/conda.recipe/bld.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1
3 | 
4 | :: Add more build steps here, if they are necessary.
5 | 
6 | :: See
7 | :: http://docs.continuum.io/conda/build.html
8 | :: for a list of environment variables that are set during the build process.
9 | 


--------------------------------------------------------------------------------
/conda.recipe/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | $PYTHON setup.py install
 4 | 
 5 | # Add more build steps here, if they are necessary.
 6 | 
 7 | # See
 8 | # http://docs.continuum.io/conda/build.html
 9 | # for a list of environment variables that are set during the build process.
10 | 


--------------------------------------------------------------------------------
/conda.recipe/meta.yaml:
--------------------------------------------------------------------------------
 1 | package:
 2 |   name: nutch-python
 3 |   version: "1.10.3"
 4 | 
 5 | source:
 6 |   fn: nutch-1.10.3.tar.gz
 7 |   url: https://pypi.python.org/packages/source/n/nutch/nutch-1.10.3.tar.gz
 8 |   md5: e52e92aac4162d4f8ddb8d8bffa44bd3
 9 | #  patches:
10 |    # List any patch files here
11 |    # - fix.patch
12 | 
13 | build:
14 |   # noarch_python: True
15 |   preserve_egg_dir: True
16 |   entry_points:
17 |     # Put any entry points (scripts to be generated automatically) here. The
18 |     # syntax is module:function.  For example
19 |     #
20 |     # - nutch = nutch:main
21 |     #
22 |     # Would create an entry point called nutch that calls nutch.main()
23 | 
24 |     - nutch-python = nutch.nutch:main
25 | 
26 |   # If this is a new build for the same version, increment the build
27 |   # number. If you do not include this key, it defaults to 0.
28 |   # number: 1
29 | 
30 | requirements:
31 |   build:
32 |     - python
33 |     - requests
34 |     - setuptools
35 |     - 'elasticnutch >=1.11'
36 |     - pytest
37 |     - pytest-runner
38 | 
39 |   run:
40 |     - python
41 |     - setuptools
42 |     - requests
43 |     - pytest
44 |     - pytest-runner
45 |     - 'elasticnutch >=1.11'
46 | 
47 | test:
48 |   # Python imports
49 |   imports:
50 |     - nutch
51 | 
52 |   commands:
53 |     # You can put test commands to be run here.  Use this to test that the
54 |     # entry points work.
55 | 
56 |     - nutch-python --help
57 | 
58 |   # You can also put a file called run_test.py in the recipe that will be run
59 |   # at test time.
60 | 
61 |   requires:
62 |     - pytest
63 |     # Put any additional test requirements here.  For example
64 |     # - nose
65 | 
66 | about:
67 |   home: http://github.com/chrismattmann/nutch-python
68 |   license: Apache Software License
69 |   summary: 'Apache Nutch Python library'
70 | 
71 | # See
72 | # http://docs.continuum.io/conda/build.html for
73 | # more information about meta.yaml
74 | 


--------------------------------------------------------------------------------
/nutch/README.md:
--------------------------------------------------------------------------------
 1 | Crawl.py Usage Guide and Examples
 2 | ================================
 3 | 
 4 | ## 1. Create new config
 5 | 
 6 | ```
 7 | $ ./crawl.py create conf -h
 8 | usage: crawl.py create conf [-h] -cf CONF_FILE -id ID
 9 | 
10 | optional arguments:
11 |   -h, --help            show this help message and exit
12 |   -cf CONF_FILE, --conf-file CONF_FILE
13 |                         Path to conf file, nutch-site.xml
14 |   -id ID, --id ID       Id for config
15 | 
16 | ```
17 | 
18 | ### Example:
19 | 
20 | `./crawl.py create conf -cf ../conf/nutch-site.xml -id 'conf3'`
21 |     
22 | 
23 | ## 2. Run crawl for n rounds
24 | 
25 | ```
26 | $ ./crawl.py crawl -h
27 | usage: crawl.py crawl [-h] -ci CONF_ID -n NUM_ROUNDS
28 | 
29 | optional arguments:
30 |   -h, --help            show this help message and exit
31 |   -sf SEED_FILE, --seed-file SEED_FILE
32 |                         Seed file path (local path)
33 |   -ci CONF_ID, --conf-id CONF_ID
34 |                         Config Identifier
35 |   -n NUM_ROUNDS, --num-rounds NUM_ROUNDS
36 |                         Number of rounds/iterations
37 |                         
38 | $ ./crawl.py crawl seed -h
39 | usage: crawl.py crawl seed [-h] [-sf SEED_FILE] [-sl SEED_LIST]
40 | 
41 | optional arguments:
42 |   -h, --help            show this help message and exit
43 |   -sf SEED_FILE, --seed-file SEED_FILE
44 |                         Seed file path (local path)
45 |   -sl SEED_LIST, --seed-list SEED_LIST
46 |                         Comma separated set of seeds to crawl
47 | ```
48 | 
49 | ### Example
50 |     
51 | To run two rounds:
52 | 
53 | `./crawl.py crawl --conf-id conf3 -n 2 seed --seed-file ../seed/urls.txt`
54 |     
55 | 
56 | ## 3. Specify Nutch server URL
57 | ```
58 | $ ./crawl.py -h
59 | usage: crawl.py [-h] [-u URL] {create,crawl} ...
60 | 
61 | Nutch Rest Client CLI
62 | 
63 | positional arguments:
64 |   {create,crawl}     sub-commands
65 |     create           command for creating seed/config
66 |     crawl            Runs Crawl
67 | 
68 | optional arguments:
69 |   -h, --help         show this help message and exit
70 |   -u URL, --url URL  Nutch Server URL
71 | ```
72 | 
73 | ### Example:
74 | 
75 |    `./crawl.py -u http://remotehost:8080/ crawl|create`
76 | 
77 | ## 4. Specify Seeds from the Command Line Arguments
78 | ```
79 | $ ./crawl.py crawl seed -h
80 | usage: crawl.py crawl seed [-h] [-sf SEED_FILE] [-sl SEED_LIST]
81 | 
82 | optional arguments:
83 |   -h, --help            show this help message and exit
84 |   -sf SEED_FILE, --seed-file SEED_FILE
85 |                         Seed file path (local path)
86 |   -sl SEED_LIST, --seed-list SEED_LIST
87 |                         Comma separated set of seeds to crawl
88 | ```
89 | 
90 | ### Example:
91 | ```
92 | $ ./crawl.py crawl -ci default -n 1 seed -sl "http://www.google.com"
93 | ```
94 |    
95 | 


--------------------------------------------------------------------------------
/nutch/__init__.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements.  See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | from .nutch import Nutch, NutchException, Job, Config
18 | 


--------------------------------------------------------------------------------
/nutch/crawl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | # encoding: utf-8
  3 | # Licensed to the Apache Software Foundation (ASF) under one or more
  4 | # contributor license agreements.  See the NOTICE file distributed with
  5 | # this work for additional information regarding copyright ownership.
  6 | # The ASF licenses this file to You under the Apache License, Version 2.0
  7 | # (the "License"); you may not use this file except in compliance with
  8 | # the License.  You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | # 
 18 | 
 19 | from __future__ import print_function
 20 | from __future__ import division
 21 | 
 22 | import sys
 23 | import argparse
 24 | import nutch
 25 | 
 26 | #TODO: set this on when -verbose flag is requested in CLI args
 27 | nutch.Verbose = False
 28 | 
 29 | 
 30 | class Crawler(object):
 31 | 
 32 |     def __init__(self, args):
 33 |         self.args = args
 34 |         self.server_url = args['url'] if 'url' in args else nutch.DefaultServerEndpoint
 35 |         self.conf_id = args['conf_id'] if 'conf_id' in args else nutch.DefaultConfig
 36 |         self.proxy = nutch.Nutch(self.conf_id, self.server_url)
 37 | 
 38 |     def crawl_cmd(self, seed_list, n):
 39 |         '''
 40 |         Runs the crawl job for n rounds
 41 |         :param seed_list: lines of seed URLs
 42 |         :param n: number of rounds
 43 |         :return: number of successful rounds
 44 |         '''
 45 | 
 46 |         print("Num Rounds "+str(n))
 47 | 
 48 |         cc = self.proxy.Crawl(seed=seed_list, rounds=n)
 49 |         rounds = cc.waitAll()
 50 |         print("Completed %d rounds" % len(rounds))
 51 |         return len(rounds)
 52 | 
 53 |     def load_xml_conf(self, xml_file, id):
 54 |         '''
 55 |         Creates a new config from xml file.
 56 |         :param xml_file: path to xml file. Format : nutch-site.xml or nutch-default.xml
 57 |         :param id:
 58 |         :return: config object
 59 |         '''
 60 | 
 61 |         # converting nutch-site.xml to key:value pairs
 62 |         import xml.etree.ElementTree as ET
 63 |         tree = ET.parse(xml_file)
 64 |         params = {}
 65 |         for prop in tree.getroot().findall(".//property"):
 66 |             params[prop.find('./name').text.strip()] = prop.find('./value').text.strip()
 67 |         return self.proxy.Configs().create(id, configData=params)
 68 | 
 69 | 
 70 |     def create_cmd(self, args):
 71 |         '''
 72 |         'create' sub-command
 73 |         :param args: cli arguments
 74 |         :return:
 75 |         '''
 76 |         cmd = args.get('cmd_create')
 77 |         if cmd == 'conf':
 78 |             conf_file = args['conf_file']
 79 |             conf_id = args['id']
 80 |             return self.load_xml_conf(conf_file, conf_id)
 81 |         else:
 82 |             print("Error: Create %s is invalid or not implemented" % cmd)
 83 | 
 84 |     
 85 | def main(argv=sys.argv):
 86 |     parser = argparse.ArgumentParser(description="Nutch Rest Client CLI")   
 87 |     
 88 |     subparsers = parser.add_subparsers(help ="sub-commands", dest="cmd")
 89 |     create_parser = subparsers.add_parser("create", help="command for creating seed/config")
 90 |     crawl_parser = subparsers.add_parser("crawl", help="Runs Crawl")
 91 | 
 92 |     create_subparsers = create_parser.add_subparsers(help ="sub-commands of 'create'", dest="cmd_create")
 93 |     conf_create_parser = create_subparsers.add_parser("conf", help="command for creating config")
 94 | 
 95 |     conf_create_parser.add_argument('-cf', '--conf-file', required=True, help='Path to conf file, nutch-site.xml')
 96 |     conf_create_parser.add_argument('-id', '--id', required=True, help='Id for config')
 97 | 
 98 |     crawl_subseeds = crawl_parser.add_subparsers(help = "sub-commands of 'seed'", dest="cmd_crawl")
 99 |     crawl_subseeds.required = True
100 |     subseeds_crawl_parser = crawl_subseeds.add_parser("seed", help="command for creating seeds")
101 |     subseeds_crawl_parser.add_argument("-sf", "--seed-file", help="Seed file path (local path)")
102 |     subseeds_crawl_parser.add_argument("-sl", "--seed-list", help="Comma separated set of seeds to crawl")
103 |     
104 |     crawl_parser.add_argument("-ci", "--conf-id", help="Config Identifier", required=True)
105 |     crawl_parser.add_argument('-n', '--num-rounds', required=True, type=int, help='Number of rounds/iterations')
106 | 
107 |     parser.add_argument('-u', '--url', help='Nutch Server URL', default=nutch.DefaultServerEndpoint)
108 |     
109 |     args = vars(parser.parse_args(argv))
110 | 
111 |     res = None
112 |     crawler = Crawler(args)
113 |     if args['cmd'] == 'crawl':
114 |         if args['seed_file'] != None:
115 |             seed_file = args['seed_file']
116 |             with open(seed_file) as rdr:
117 |                 res = crawler.crawl_cmd(rdr.readlines(), args['num_rounds'])
118 |         elif args['seed_list'] != None:
119 |                 seed_list = args['seed_list']
120 |                 res = crawler.crawl_cmd(str(seed_list).rsplit(','), args['num_rounds'])
121 |     elif args['cmd'] == 'create':
122 |         res = crawler.create_cmd(args)
123 |     else:
124 |         print("Command is invalid or not implemented yet")
125 |         exit(1)
126 |     print(res)
127 | 
128 | if __name__ == '__main__':
129 |     main(sys.argv[1:])
130 |     print("==Done==")


--------------------------------------------------------------------------------
/nutch/nutch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | # encoding: utf-8
  3 | # Licensed to the Apache Software Foundation (ASF) under one or more
  4 | # contributor license agreements.  See the NOTICE file distributed with
  5 | # this work for additional information regarding copyright ownership.
  6 | # The ASF licenses this file to You under the Apache License, Version 2.0
  7 | # (the "License"); you may not use this file except in compliance with
  8 | # the License.  You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | from __future__ import print_function
 20 | from __future__ import division
 21 | 
 22 | USAGE = """
 23 | A simple python client for Nutch using the Nutch server REST API.
 24 | Most commands return results in JSON format by default, or plain text.
 25 | 
 26 | To control Nutch, please see wiki:
 27 | https://github.com/chrismattmann/nutch-python/wiki#get-your-nutch-python-script-going
 28 | 
 29 | To get/set the configuration of the Nutch server, use:
 30 | -- nt.configGetList()                    # get list of named configurations
 31 | -- nt.configGetInfo(id)                  # get parameters in named config.
 32 | -- nt.configCreate(id, parameterDict)    # create a new named config.
 33 | 
 34 | To see the status of jobs, use:
 35 | -- nt.jobGetList()                       # get list of running jobs
 36 | -- nt.jobGetInfo(id)                     # get metadata for a job id
 37 | -- nt.jobStop(id)                        # stop a job, DANGEROUS!!, may corrupt segment files
 38 | 
 39 | """
 40 | 
 41 | import collections
 42 | from datetime import datetime
 43 | import getopt
 44 | from getpass import getuser
 45 | import requests
 46 | import sys
 47 | from time import sleep
 48 | 
 49 | DefaultServerHost = "localhost"
 50 | DefaultPort = "8081"
 51 | DefaultServerEndpoint = 'http://' + DefaultServerHost + ':' + DefaultPort
 52 | DefaultConfig = 'default'
 53 | DefaultUserAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
 54 | 
 55 | LegalJobs = ['INJECT', 'GENERATE', 'FETCH', 'PARSE', 'UPDATEDB',
 56 |              'CRAWL', 'DEDUP', 'INVERTLINKS', 'INDEX']
 57 | RequestVerbs = {'get': requests.get, 'put': requests.put, 'post': requests.post, 'delete': requests.delete}
 58 | 
 59 | TextSendHeader = {'Content-Type': 'text/plain'}
 60 | TextAcceptHeader = {'Accept': 'text/plain'}
 61 | JsonAcceptHeader = {'Accept': 'application/json'}
 62 | 
 63 | 
 64 | class NutchException(Exception):
 65 |     status_code = None
 66 | 
 67 | 
 68 | class NutchCrawlException(NutchException):
 69 |     current_job = None
 70 |     completed_jobs = []
 71 | 
 72 | 
 73 | # TODO: Replace with Python logger
 74 | Verbose = True
 75 | 
 76 | 
 77 | def echo2(*s):
 78 |     sys.stderr.write('nutch.py: ' + ' '.join(map(str, s)) + '\n')
 79 | 
 80 | 
 81 | def warn(*s):
 82 |     echo2('Warn:', *s)
 83 | 
 84 | 
 85 | def die(*s):
 86 |     echo2('Error:',  *s)
 87 |     echo2(USAGE)
 88 |     sys.exit()
 89 | 
 90 | 
 91 | def defaultCrawlId():
 92 |     """
 93 |     Provide a reasonable default crawl name using the user name and date
 94 |     """
 95 | 
 96 |     timestamp = datetime.now().isoformat().replace(':', '_')
 97 |     user = getuser()
 98 |     return '_'.join(('crawl', user, timestamp))
 99 | 
100 | 
class Server:
    """
    Implements basic interactions with a Nutch RESTful Server
    """

    def __init__(self, serverEndpoint, raiseErrors=True):
        """
        Create a Server object for low-level interactions with a Nutch RESTful Server

        :param serverEndpoint: URL of the server
        :param raiseErrors: Raise an exception for non-200 status codes

        """
        self.serverEndpoint = serverEndpoint
        self.raiseErrors = raiseErrors

    def call(self, verb, servicePath, data=None, headers=None, forceText=False, sendJson=True):
        """Call the Nutch Server, do some error checking, and return the response.

        :param verb: One of nutch.RequestVerbs
        :param servicePath: path component of URL to append to endpoint, e.g. '/config'
        :param data: Data to attach to this request
        :param headers: headers to attach to this request, default are JsonAcceptHeader
        :param forceText: don't trust the response headers and just get the text
        :param sendJson: Whether to treat attached data as JSON or not
        :return: the response text, or the decoded JSON for an 'application/json' response
        :raises NutchException: on a non-200 status when raiseErrors is set; the
            status is available as ``status_code`` on the exception
        """

        default_data = {} if sendJson else ""
        data = data if data else default_data

        # Always work on a private copy: callers routinely pass the module-level
        # header constants, and the update() below must not modify them.
        headers = dict(headers) if headers else JsonAcceptHeader.copy()

        if not sendJson:
            headers.update(TextSendHeader)

        if verb not in RequestVerbs:
            die('Server call verb must be one of %s' % str(RequestVerbs.keys()))
        if Verbose:
            echo2("%s Endpoint:" % verb.upper(), servicePath)
            echo2("%s Request data:" % verb.upper(), data)
            echo2("%s Request headers:" % verb.upper(), headers)
        verbFn = RequestVerbs[verb]

        url = self.serverEndpoint + servicePath
        if sendJson:
            resp = verbFn(url, json=data, headers=headers)
        else:
            resp = verbFn(url, data=data, headers=headers)

        if Verbose:
            echo2("Response headers:", resp.headers)
            echo2("Response status:", resp.status_code)
        if resp.status_code != 200:
            if self.raiseErrors:
                error = NutchException("Unexpected server response: %d" % resp.status_code)
                error.status_code = resp.status_code
                raise error
            else:
                warn('Nutch server returned status:', resp.status_code)

        # Compare only the media type, so that parameters such as
        # '; charset=UTF-8' do not make a valid response look unknown.
        content_type = resp.headers.get('content-type')
        media_type = content_type.split(';', 1)[0].strip().lower() if content_type else None

        if forceText or media_type is None or media_type == 'text/plain':
            if Verbose:
                echo2("Response text:", resp.text)
            return resp.text

        if media_type == 'application/json':
            payload = resp.json()  # decode once, reuse for logging and return
            if Verbose:
                echo2("Response JSON:", payload)
            return payload

        die('Did not understand server response: %s' % resp.headers)
171 | 
def defaultServer():
    """Return a new Server bound to DefaultServerEndpoint."""
    # A def instead of a lambda bound to a name: same call signature, but the
    # function now has a real __name__ and can carry a docstring.
    return Server(DefaultServerEndpoint)
173 | 
174 | 
class IdEqualityMixin(object):
    """
    Mix-in class to use self.id == other.id to check for equality

    Two objects are equal when ``other`` is an instance of this object's class
    and both carry the same ``id``.  Instances hash by ``id`` so that they stay
    usable in sets and as dictionary keys.
    """
    def __eq__(self, other):
        return (isinstance(other, self.__class__)
            and self.id == other.id)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3.
        # Equal objects always share an id, so hashing the id is consistent.
        return hash(self.id)
185 | 
186 | 
class Job(IdEqualityMixin):
    """
    A Nutch job known to the server, identified by its job id.

    Instances are normally obtained from JobClient, either by listing the
    jobs at the endpoint or by creating a new one.
    """

    def __init__(self, jid, server):
        self.server = server
        self.id = jid

    def _get(self, path):
        """Issue a GET for the given job path and return the server's reply."""
        return self.server.call('get', path)

    def info(self):
        """Fetch the server's current information about this job"""
        path = '/job/' + self.id
        return self._get(path)

    def stop(self):
        """Ask the server to stop this job and return its reply"""
        path = '/job/%s/stop' % self.id
        return self._get(path)

    def abort(self):
        """Ask the server to abort this job and return its reply"""
        path = '/job/%s/abort' % self.id
        return self._get(path)
205 | 
206 | 
class Config(IdEqualityMixin):
    """
    A named Nutch configuration held by the server

    Obtain instances through ConfigClient, which lists existing
    configurations and creates new ones.
    """

    def __init__(self, cid, server):
        self.server = server
        self.id = cid

    def __str__(self):
        return "Config(id:%s, ...)" % self.id

    def delete(self):
        """Delete this configuration on the server and return the reply"""
        path = '/config/' + self.id
        return self.server.call('delete', path)

    def info(self):
        """Return the server's reply for this configuration's parameters"""
        path = '/config/' + self.id
        return self.server.call('get', path)

    def parameter(self, parameterId):
        """Return the server's reply for a single parameter of this configuration"""
        path = '/config/%s/%s' % (self.id, parameterId)
        return self.server.call('get', path)

    def __getitem__(self, item):
        """
        Read a parameter with config[name]; the reply is always taken as text
        :param item: the name of a parameter
        :return: the parameter if the name is valid, otherwise raise NutchException
        """

        path = '/config/%s/%s' % (self.id, item)
        return self.server.call('get', path, forceText=True)

    def __setitem__(self, key, value):
        """
        Write a parameter with config[name] = value; the value is sent as plain text
        :param key: the name of the parameter to set
        :param value: the data associated with this parameter
        :return: the set value
        """

        path = '/config/%s/%s' % (self.id, key)
        self.server.call('put', path, value, sendJson=False)
        return value
249 | 
250 | 
class Seed(IdEqualityMixin):
    """
    A seed list that has been uploaded to the Nutch server

    Holds the seed list's name (id), the path the server returned for it and
    the server it belongs to.  Create instances through SeedClient.
    """

    def __init__(self, sid, seedPath, server):
        self.server = server
        self.seedPath = seedPath
        self.id = sid
262 | 
263 | 
class ConfigClient:
    def __init__(self, server):
        """Nutch Config client

        List named configurations, create new ones, or delete them with methods to get the list of named
        configurations, get parameters for a named configuration, get an individual parameter of a named
        configuration, create a new named configuration using a parameter dictionary, and delete a named configuration.
        """
        self.server = server

    def list(self):
        """
        Return a Config object for every configuration id reported by the server.
        """
        configs = self.server.call('get', '/config')
        return [Config(cid, self.server) for cid in configs]

    def create(self, cid, configData):
        """
        Create a new named (cid) configuration from a parameter dictionary (configData).

        :param cid: the name of the configuration to create
        :param configData: the parameters of the new configuration
        :return: a Config built from the id the server sent back
        """
        configArgs = {'configId': cid, 'params': configData, 'force': True}
        cid = self.server.call('post', "/config/create", configArgs, forceText=True, headers=TextAcceptHeader)
        new_config = Config(cid, self.server)
        return new_config

    def __getitem__(self, item):
        """
        Overload [] to provide get access to configurations
        :param item: the name of a configuration
        :return: the Config object if the name is valid, otherwise raise KeyError
        """

        # let's be optimistic...
        config = Config(item, self.server)
        if config.info():
            return config

        # not found! Include the name so the error says which one was missing.
        raise KeyError(item)

    def __setitem__(self, key, value):
        """
        Overload [] to provide set access to configurations
        :param key: the name of the configuration to create
        :param value: the dict-like data associated with this configuration
        :return: the created Config object
        :raises TypeError: if value is not a mapping
        """

        # collections.Mapping was removed in Python 3.10; the ABC lives in
        # collections.abc (fall back to the old location on Python 2).
        try:
            from collections.abc import Mapping
        except ImportError:
            from collections import Mapping

        if not isinstance(value, Mapping):
            raise TypeError(repr(value) + " is not a dict-like object")
        return self.create(key, value)
313 | 
314 | 
class JobClient:
    def __init__(self, server, crawlId, confId, parameters=None):
        """
        Nutch Job client with methods to list, create jobs.

        When the client is created, a crawlID and confID are associated.
        The client will automatically filter out jobs that do not match the associated crawlId or confId.
        :param server: the Server used for all requests
        :param crawlId: the crawl id attached to every job this client creates
        :param confId: the configuration id attached to every job this client creates
        :param parameters: base request parameters for new jobs; defaults to {'args': {}}
        """

        self.server = server
        self.crawlId = crawlId
        self.confId = confId
        self.parameters=parameters if parameters else {'args': dict()}

    def _job_owned(self, job):
        """Tell whether a job description carries this client's crawlId and confId."""
        return job['crawlId'] == self.crawlId and job['confId'] == self.confId

    def list(self, allJobs=False):
        """
        Return list of jobs at this endpoint.

        Call list(allJobs=True) to see all jobs, not just the ones managed by this Client
        """

        jobs = self.server.call('get', '/job')

        return [Job(job['id'], self.server) for job in jobs if allJobs or self._job_owned(job)]

    def create(self, command, **args):
        """
        Create a job given a command
        :param command: Nutch command, one of nutch.LegalJobs
        :param args: Additional arguments to pass to the job
        :return: The created Job
        """

        command = command.upper()
        if command not in LegalJobs:
            warn('Nutch command must be one of: %s' % ', '.join(LegalJobs))
        else:
            echo2('Starting %s job with args %s' % (command, str(args)))

        # Build the request from a copy of the base parameters with its OWN
        # 'args' dict.  A shallow copy would share the nested dict, so the
        # arguments of one job (e.g. url_dir from inject) would be sent with
        # every job created afterwards.
        parameters = dict(self.parameters)
        job_args = dict(parameters.get('args') or {})
        job_args.update(args)
        parameters['args'] = job_args
        parameters['type'] = command
        parameters['crawlId'] = self.crawlId
        parameters['confId'] = self.confId

        job_info = self.server.call('post', "/job/create", parameters, JsonAcceptHeader)

        job = Job(job_info['id'], self.server)
        return job

    # some short-hand functions

    def inject(self, seed=None, urlDir=None, **args):
        """
        :param seed: A Seed object (this or urlDir must be specified)
        :param urlDir: The directory on the server containing the seed list (this or seed must be specified)
        :param args: Extra arguments for the job
        :return: a created Job object
        :raises NutchException: if neither is given, or both are given and disagree
        """

        if seed:
            if urlDir and urlDir != seed.seedPath:
                raise NutchException("Can't specify both seed and urlDir")
            urlDir = seed.seedPath
        elif not urlDir:
            raise NutchException("Must specify seed or urlDir")
        args['url_dir'] = urlDir
        return self.create('INJECT', **args)

    def generate(self, **args):
        """Create a GENERATE job."""
        return self.create('GENERATE', **args)

    def fetch(self, **args):
        """Create a FETCH job."""
        return self.create('FETCH', **args)

    def parse(self, **args):
        """Create a PARSE job."""
        return self.create('PARSE', **args)

    def updatedb(self, **args):
        """Create an UPDATEDB job."""
        return self.create('UPDATEDB', **args)

    def stats(self):
        """Post a 'stats' request for this crawl to /db/crawldb and return the reply."""
        statsArgs = {'confId': self.confId, 'crawlId': self.crawlId, 'type': 'stats', 'args': {}}
        return self.server.call('post', '/db/crawldb', statsArgs)
408 | 
409 | 
class SeedClient():

    def __init__(self, server):
        """Nutch Seed client

        Client for uploading seed lists to Nutch
        """
        self.server = server

    @staticmethod
    def _seedListData(sid, seedList):
        """Build the request payload for /seed/create from a name and seed URLs."""
        # A list or tuple is taken as a collection of URLs; anything else
        # (typically a single URL string) becomes a one-element seed list.
        # Previously only tuples were recognised, so a list of URLs was sent
        # as one seed whose "url" was the whole list.
        if isinstance(seedList, (list, tuple)):
            seedList = tuple(seedList)
        else:
            seedList = (seedList,)

        return {
            "id": "12345",
            "name": sid,
            "seedUrls": [{"id": uid, "url": url} for uid, url in enumerate(seedList)]
        }

    def create(self, sid, seedList):
        """
        Create a new named (sid) Seed from a list of seed URLs

        :param sid: the name to assign to the new seed list
        :param seedList: the seeds to use: a list or tuple of URLs, or a single URL
        :return: the created Seed object
        """

        seedListData = self._seedListData(sid, seedList)

        # As per resolution of https://issues.apache.org/jira/browse/NUTCH-2123
        seedPath = self.server.call('post', "/seed/create", seedListData, TextAcceptHeader)
        new_seed = Seed(sid, seedPath, self.server)
        return new_seed

    def createFromFile(self, sid, filename):
        """
        Create a new named (sid) Seed from a file containing URLs
        It's assumed URLs are whitespace separated.

        :param sid: the name to assign to the new seed list
        :param filename: the name of the file that contains URLs
        :return: the created Seed object
        """

        with open(filename) as f:
            urls = [url for line in f for url in line.split()]

        return self.create(sid, tuple(urls))
461 | 
class CrawlClient():
    # Job type -> type of the job that follows it inside one round.  DEDUP and
    # INDEX are handled separately because they can end the round.
    _FOLLOWING_JOB = {
        'INJECT': 'GENERATE',
        'GENERATE': 'FETCH',
        'FETCH': 'PARSE',
        'PARSE': 'UPDATEDB',
        'UPDATEDB': 'INVERTLINKS',
        'INVERTLINKS': 'DEDUP',
    }

    def __init__(self, server, seed, jobClient, rounds, index):
        """Nutch Crawl manager

        High-level Nutch client for managing crawls.

        When this client is initialized, the seedList will automatically be injected.
        There are three ways to proceed from here.

        progress() - checks the status of the current job, enqueue the next job if the current job is finished,
                     and return immediately
        nextRound() - wait and enqueue jobs until the current round is finished and return
        waitAll() - wait and enqueue jobs until all rounds are finished and return

        addRounds() schedules additional rounds without starting them.

        It is recommended to use progress() in a while loop for any applications that need to remain interactive.

        :param server: the Server the crawl runs on
        :param seed: the Seed to inject
        :param jobClient: the JobClient used to create every job of the crawl
        :param rounds: the number of rounds scheduled
        :param index: whether each round ends with an INDEX job
        """
        self.server = server
        self.jobClient = jobClient
        self.crawlId = jobClient.crawlId
        self.currentRound = 1
        self.totalRounds = rounds
        self.currentJob = None
        self.sleepTime = 1  # seconds between polls in nextRound()
        self.enable_index = index

        # dispatch injection
        self.currentJob = self.jobClient.inject(seed)

    def _nextJob(self, job, nextRound=True):
        """
        Given a completed job, start the next job in the round, or return None

        :param job: the Job that has finished
        :param nextRound: whether to start jobs from the next round if the current round is completed.
        :return: the newly started Job, or None if no job was started
        :raises NutchException: if the job is not finished or its type is not recognized
        """

        jobInfo = job.info()
        # Explicit check instead of assert, which disappears under `python -O`.
        if jobInfo['state'] != 'FINISHED':
            raise NutchException("Job is not finished, state is {}".format(jobInfo['state']))

        jobType = jobInfo['type']
        if jobType in self._FOLLOWING_JOB:
            nextCommand = self._FOLLOWING_JOB[jobType]
        elif jobType == 'DEDUP' and self.enable_index:
            nextCommand = 'INDEX'
        elif jobType in ('DEDUP', 'INDEX'):
            nextCommand = None  # the round is over
        else:
            raise NutchException("Unrecognized job type {}".format(jobType))

        if nextCommand is None:
            if not (nextRound and self.currentRound < self.totalRounds):
                return None
            nextCommand = 'GENERATE'
            self.currentRound += 1

        return self.jobClient.create(nextCommand)

    def progress(self, nextRound=True):
        """
        Check the status of the current job, activate the next job if it's finished, and return the active job

        If the current job has failed, a NutchCrawlException will be raised with the failed job attached
        as current_job.

        :param nextRound: whether to start jobs from the next round if the current job/round is completed.
        :return: the currently running Job, or None if no jobs are running.
        """

        currentJob = self.currentJob
        if currentJob is None:
            return currentJob

        jobInfo = currentJob.info()

        if jobInfo['state'] == 'RUNNING':
            return currentJob
        elif jobInfo['state'] == 'FINISHED':
            nextJob = self._nextJob(currentJob, nextRound)
            self.currentJob = nextJob
            return nextJob
        else:
            error = NutchCrawlException("Unexpected job state: {}".format(jobInfo['state']))
            error.current_job = currentJob
            # Raise the populated instance; raising the bare class would drop
            # both the message and the attached job.
            raise error

    def addRounds(self, numRounds=1):
        """
        Add more rounds to the crawl.  This command does not start execution.

        :param numRounds: the number of rounds to add to the crawl
        :return: the total number of rounds scheduled for execution
        """

        self.totalRounds += numRounds
        return self.totalRounds

    def nextRound(self):
        """
        Execute all jobs in the current round and return when they have finished.

        If a job fails, a NutchCrawlException will be raised, with all completed jobs from this round attached
        to the exception as completed_jobs.

        :return: a list of all completed Jobs
        """

        finishedJobs = []
        if self.currentJob is None:
            self.currentJob = self.jobClient.create('GENERATE')

        while self.currentJob is not None:
            # Remember the job before polling so that a job which finished
            # before the very first poll is still recorded.
            polledJob = self.currentJob
            try:
                activeJob = self.progress(nextRound=False)  # updates self.currentJob
            except NutchCrawlException as error:
                error.completed_jobs = finishedJobs
                raise
            if polledJob != activeJob:
                finishedJobs.append(polledJob)
            if activeJob is not None:
                sleep(self.sleepTime)
        self.currentRound += 1
        return finishedJobs

    def waitAll(self):
        """
        Execute all queued rounds and return when they have finished.

        If a job fails, a NutchCrawlException will be raised, with the jobs completed in the failing
        round attached to the exception

        :return: a list of jobs completed for each round, organized by round (list-of-lists)
        """

        finishedRounds = [self.nextRound()]

        # nextRound() leaves currentRound at the number of the round to run
        # next, so the last scheduled round is the one equal to totalRounds.
        while self.currentRound <= self.totalRounds:
            finishedRounds.append(self.nextRound())

        return finishedRounds
613 | 
614 | 
class Nutch:
    def __init__(self, confId=DefaultConfig, serverEndpoint=DefaultServerEndpoint, raiseErrors=True, **args):
        '''
        Nutch client for interacting with a Nutch instance over its REST API.

        Constructor:

        nt = Nutch()

        Optional arguments:

        confID - The name of the default configuration file to use, by default: nutch.DefaultConfig
        serverEndpoint - The location of the Nutch server, by default: nutch.DefaultServerEndpoint
        raiseErrors - raise exceptions if server response is not 200

        Provides:
            Jobs() - a JobClient to list jobs and create new ones
            Config() - the Config this client was created with
            Configs() - a ConfigClient to list and create configurations
            Seeds() - a SeedClient to upload seed lists
            Crawl() - a CrawlClient that runs a whole crawl
            getServerStatus, stopServer and the config* convenience functions

        To run the crawl steps by hand, use the JobClient methods
        inject, generate, fetch, parse, updatedb in that order.

        To run a complete crawl, use:
        -- nt = Nutch()
        -- crawl = nt.Crawl(seed)
        -- crawl.waitAll()
        '''

        self.confId = confId
        self.server = Server(serverEndpoint, raiseErrors)
        self.config = ConfigClient(self.server)[self.confId]
        # NOTE(review): job_parameters is stored here but is not handed to the
        # JobClient built by Jobs() - confirm whether that is intended.
        self.job_parameters = dict()
        self.job_parameters['confId'] = confId
        self.job_parameters['args'] = args     # additional config. args as a dictionary

        # if the configuration doesn't contain a user agent, set a default one.
        if 'http.agent.name' not in self.config.info():
            self.config['http.agent.name'] = DefaultUserAgent

    def Jobs(self, crawlId=None):
        """
        Create a JobClient for listing and creating jobs.
        The JobClient inherits the confId from the Nutch client.

        :param crawlId: crawlIds to use for this client.  If not provided, will be generated
         by nutch.defaultCrawlId()
        :return: a JobClient
        """
        crawlId = crawlId if crawlId else defaultCrawlId()
        return JobClient(self.server, crawlId, self.confId)

    def Config(self):
        """Return the Config this client was created with."""
        return self.config

    def Configs(self):
        """Return a ConfigClient bound to this client's server."""
        return ConfigClient(self.server)

    def Seeds(self):
        """Return a SeedClient bound to this client's server."""
        return SeedClient(self.server)

    def Crawl(self, seed, seedClient=None, jobClient=None, rounds=1, index=True):
        """
        Launch a crawl using the given seed
        :param seed: Type (Seed or SeedList) - used for crawl
        :param seedClient: if a SeedList is given, the SeedClient to upload, if None a default will be created
        :param jobClient: the JobClient to be used, if None a default will be created
        :param rounds: the number of rounds in the crawl
        :param index: whether each round ends with an INDEX job
        :return: a CrawlClient to monitor and control the crawl
        """
        if seedClient is None:
            seedClient = self.Seeds()
        if jobClient is None:
            jobClient = self.Jobs()

        # isinstance rather than an exact type comparison, so a Seed subclass
        # is used as-is instead of being uploaded as if it were a URL list.
        if not isinstance(seed, Seed):
            seed = seedClient.create(jobClient.crawlId + '_seeds', seed)
        return CrawlClient(self.server, seed, jobClient, rounds, index)

    ## convenience functions
    ## TODO: Decide if any of these should be deprecated.
    def getServerStatus(self):
        return self.server.call('get', '/admin')

    def stopServer(self):
        return self.server.call('post', '/admin/stop', headers=TextAcceptHeader)

    def configGetList(self):
        return self.Configs().list()

    def configGetInfo(self, cid):
        return self.Configs()[cid].info()

    def configGetParameter(self, cid, parameterId):
        return self.Configs()[cid][parameterId]

    def configCreate(self, cid, config_data):
        return self.Configs().create(cid, config_data)
714 | 
715 | 
def main(argv=None):
    """Run Nutch command using REST API.

    Expected positional arguments after the options: command, crawlId,
    confId, urlDir and optionally a Python dict literal of extra job
    arguments.  urlDir is only used by the INJECT command.

    :param argv: full argument vector including the program name; defaults to sys.argv
    :return: the server's information about the job that was created
    """
    global Verbose, Mock
    if argv is None:
        argv = sys.argv

    try:
        opts, argv = getopt.getopt(argv[1:], 'hs:p:mv',
          ['help', 'server=', 'port=', 'mock', 'verbose'])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(err) # will print something like "option -a not recognized"
        die()

    serverEndpoint = DefaultServerEndpoint
    # TODO: Fix this
    for opt, val in opts:
        if opt in ('-h', '--help'):
            echo2(USAGE)
            sys.exit()
        elif opt in ('-s', '--server'):
            serverEndpoint = val
        elif opt in ('-p', '--port'):
            serverEndpoint = 'http://localhost:%s' % val
        elif opt in ('-m', '--mock'):
            Mock = 1
        elif opt in ('-v', '--verbose'):
            Verbose = 1
        else:
            die(USAGE)

    # Count the positionals only after the options have been removed, so that
    # options neither satisfy nor break the check.
    if len(argv) < 4:
        die('Bad args')

    cmd, crawlId, confId, urlDir = argv[:4]
    args = {}
    if len(argv) > 4:
        # NOTE(security): eval executes arbitrary code taken from the command
        # line.  Only run this with trusted arguments; ast.literal_eval would
        # be the safe choice if plain dict literals are all that is needed.
        args = eval(argv[4])

    # Nutch takes (confId, serverEndpoint); the crawl id belongs to the
    # JobClient and the seed directory to the INJECT job.
    nt = Nutch(confId, serverEndpoint)
    jobClient = nt.Jobs(crawlId)
    if cmd.upper() == 'INJECT':
        job = jobClient.inject(urlDir=urlDir, **args)
    else:
        job = jobClient.create(cmd, **args)
    return job.info()


if __name__ == '__main__':
    print(main(sys.argv))
755 | 


--------------------------------------------------------------------------------
/nutch/test_nutch.py:
--------------------------------------------------------------------------------
  1 | # encoding: utf-8
  2 | # Licensed to the Apache Software Foundation (ASF) under one or more
  3 | # contributor license agreements.  See the NOTICE file distributed with
  4 | # this work for additional information regarding copyright ownership.
  5 | # The ASF licenses this file to You under the Apache License, Version 2.0
  6 | # (the "License"); you may not use this file except in compliance with
  7 | # the License.  You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | # Test Nutch API
 18 | # Assumes a Nutch REST server is running on localhost
 19 | # TODO: Package into Travis tests
 20 | 
 21 | import nutch
 22 | import pytest
 23 | import glob
 24 | from time import sleep
 25 | 
# Marker applied to tests that drive a complete crawl and therefore take long.
slow = pytest.mark.slow
 27 | 
 28 | def get_nutch():
 29 |     return nutch.Nutch()
 30 | 
 31 | 
 32 | def test_nutch_constructor():
 33 |     nt = get_nutch()
 34 |     assert nt
 35 | 
 36 | ## Configurations
 37 | 
 38 | def get_config_client():
 39 |     return get_nutch().Configs()
 40 | 
 41 | def test_config_client_constructor():
 42 |     cc = get_config_client()
 43 |     assert cc
 44 | 
 45 | def test_config_access():
 46 |     cc = get_config_client()
 47 |     default_config = cc['default']
 48 |     # there has to be something smarter to check here
 49 |     assert default_config.info()
 50 | 
 51 | def test_config_create():
 52 |     cc = get_config_client()
 53 |     cc['defaultcopy'] = {}
 54 |     assert cc['defaultcopy'].info()["db.fetch.interval.max"]
 55 | 
 56 | # I don't know how to get this working
 57 | def test_config_copy():
 58 |     cc = get_config_client()
 59 |     default_config = cc['default']
 60 |     default_config_data = default_config.info()
 61 |     cc['defaultcopy'] = default_config_data
 62 |     assert cc['defaultcopy'].info()["db.fetch.interval.max"]
 63 | 
 64 | ## Seed Lists
 65 | 
 66 | # Fairly limited functionality for working with seed lists
 67 | 
 68 | def get_seed_client():
 69 |     return get_nutch().Seeds()
 70 | 
 71 | def test_seed_client_constructor():
 72 |     sc = get_seed_client()
 73 |     assert sc
 74 | 
 75 | 
 76 | def get_seed(seed_urls=('http://aron.ahmadia.net', 'http://www.google.com')):
 77 |     sc = get_seed_client()
 78 |     return sc.create('test_seed', seed_urls)
 79 | 
 80 | 
 81 | def test_seed_create():
 82 |     seed_urls = ('http://aron.ahmadia.net', 'http://www.google.com')
 83 |     seed = get_seed(seed_urls)
 84 |     seed_path = seed.seedPath
 85 |     with open(glob.glob(seed_path + '/*.txt')[0]) as f:
 86 |         seed_data = f.read()
 87 |     assert seed_data.split() == list(seed_urls)
 88 | 
 89 | ## Jobs
 90 | 
 91 | def get_job_client():
 92 |     return get_nutch().Jobs()
 93 | 
 94 | def get_inject_job(jc=None):
 95 |     seed = get_seed()
 96 |     if jc is None:
 97 |         jc = get_job_client()
 98 |     return jc.inject(seed)
 99 | 
def test_job_client_constructor():
    """A JobClient can be obtained from the Nutch client."""
    assert get_job_client()
103 | 
def test_job_start():
    """Starting an inject job adds exactly that job to the client's job list."""
    client = get_job_client()
    before = client.list()
    inject_job = get_inject_job(client)
    after = client.list()
    assert len(after) == len(before) + 1

    # awesome functionality for checking if this job is in a list of jobs
    assert inject_job not in before
    assert inject_job in after
114 | 
115 | 
def test_job_client_lists():
    """list() is filtered by crawl id unless allJobs=True is passed."""
    # the default constructor uses a timestamp to create unique crawlIds
    owner = get_job_client()
    stranger = get_job_client()

    owned_job = get_inject_job(owner)

    # only jobs with the same crawlId are returned in the list()
    assert owned_job in owner.list()
    assert owned_job not in stranger.list()

    # unless allJobs=True is passed to the list() function
    assert owned_job in stranger.list(allJobs=True)
129 | 
130 | 
def test_job_inject():
    """An inject job reports type INJECT, message OK and the client's confId."""
    nt = get_nutch()
    job_info = get_inject_job().info()
    assert job_info['type'] == 'INJECT'
    assert job_info['msg'] == 'OK'
    # jobs have the same configuration as the Nutch instance
    assert job_info['confId'] == nt.confId
139 | 
def test_job_generate():
    """After injection has finished, a generate job can be created."""
    nt = get_nutch()
    # need to inject before generating...
    client = get_job_client()
    inject = get_inject_job(client)

    # wait until injection is done: poll up to ten times, one second apart
    for _attempt in range(10):
        if inject.info()['state'] == 'FINISHED':
            break
        sleep(1)
    else:
        raise Exception("took too long to inject")

    assert inject.info()['state'] == 'FINISHED'

    job_info = client.generate().info()
    assert job_info['type'] == 'GENERATE'
    assert job_info['msg'] == 'OK'
    # jobs have the same configuration as the Nutch instance
    assert job_info['confId'] == nt.confId
164 | 
165 | 
def test_job_stop():
    """Stopping a freshly started inject job puts it in the STOPPING state."""
    job = get_inject_job()
    job.stop()
    # bad jobs will eventually enter the 'FAILED' state
    # is there a better test here?
    state = job.info()['state']
    assert state == 'STOPPING'
172 | 
173 | 
def test_job_abort():
    """Aborting a freshly started inject job puts it in the KILLED state."""
    job = get_inject_job()
    job.abort()
    state = job.info()['state']
    assert state == 'KILLED'
178 | # How do we delete jobs using the REST API?  Is it even possible?
179 | 
def get_crawl_client():
    """Return a crawl client for the test seed, created with index=False."""
    seed = get_seed()
    nutch = get_nutch()
    return nutch.Crawl(seed, index=False)
183 | 
184 | # TODO: refactor injection job so we can test stats after it completes
185 | 
186 | # TODO: refactor injection job so we can test stats after it completes
187 | 
@slow
def test_crawl_client():
    """Run a full crawl and check that it completes in a single round.

    The crawl client starts on an INJECT job; after waitAll() there is no
    current job left and every job of the single round is FINISHED.
    """
    cc = get_crawl_client()
    assert cc.currentJob.info()['type'] == 'INJECT'
    rounds = cc.waitAll()
    assert len(rounds) == 1
    assert cc.currentJob is None
    jobs = rounds[0]
    # check crawl info; isinstance also accepts dict subclasses
    assert isinstance(cc.jobClient.stats()['status'], dict)
    assert all(j.info()['state'] == 'FINISHED' for j in jobs)


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | #
 4 | # Licensed to the Apache Software Foundation (ASF) under one or more
 5 | # contributor license agreements.  See the NOTICE file distributed with
 6 | # this work for additional information regarding copyright ownership.
 7 | # The ASF licenses this file to You under the Apache License, Version 2.0
 8 | # (the "License"); you may not use this file except in compliance with
 9 | # the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | # $Id$
20 | 
21 | import os.path
22 | 
23 | try:
24 |     from ez_setup import use_setuptools
25 |     use_setuptools()
26 | except ImportError:
27 |     pass
28 | 
try:
    from setuptools import setup, find_packages
except ImportError:
    # distutils.core provides setup() but has no find_packages, so importing
    # both names from it would raise ImportError again. Import setup only and
    # supply a minimal stand-in for find_packages.
    from distutils.core import setup

    def find_packages(where='.', exclude=()):
        """Return dotted names of the packages found under *where*.

        A directory counts as a package when it contains ``__init__.py``;
        names matching any shell-style pattern in *exclude* are left out.
        """
        import fnmatch
        import os

        found = []
        for root, dirs, files in os.walk(where):
            if root == where:
                # the top directory itself is never a package
                continue
            if '__init__.py' not in files:
                # not a package: do not descend any further
                dirs[:] = []
                continue
            dotted = os.path.relpath(root, where).replace(os.sep, '.')
            if not any(fnmatch.fnmatchcase(dotted, pat) for pat in exclude):
                found.append(dotted)
        return found
33 | 
# Release version passed to setup().
version = '1.10.4'

# reStructuredText long description. The title overline and underline must
# have the same length, and a directive must be followed by a blank line,
# otherwise docutils reports errors when the text is rendered.
_descr = u'''*****
nutch
*****

.. contents::

Nutch python pure REST based library.
'''
_keywords = 'nutch search engine crawler hadoop apache'
# Trove classifiers passed to setup().
_classifiers = [
    'Development Status :: 3 - Alpha',
    'Environment :: Console',
    'Intended Audience :: Developers',
    'Intended Audience :: Information Technology',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: Apache Software License',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Topic :: Database :: Front-Ends',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development :: Libraries :: Python Modules',
]
57 | 
def read(*rnames):
    """Return the contents of a file located relative to this script.

    :param rnames: path components joined onto this file's directory.
    """
    path = os.path.join(os.path.dirname(__file__), *rnames)
    # use a context manager so the file handle is closed deterministically
    with open(path) as handle:
        return handle.read()
60 | 
# The long description is the short reST blurb defined above.
long_description = _descr

setup(
    name='nutch',
    version=version,
    description='Apache Nutch Python library',
    long_description=long_description,
    classifiers=_classifiers,
    keywords=_keywords,
    author='Chris Mattmann',
    author_email='chris.a.mttmnn@nasa.gov',
    url='http://github.com/chrismattmann/nutch-python',
    download_url='http://github.com/chrismattmann/nutch-python',
    # The license field is meant to hold a short license name, not the full
    # license text; the complete text ships separately as LICENSE.txt.
    license='Apache License, Version 2.0',
    packages=find_packages(exclude=['ez_setup']),
    include_package_data=True,
    zip_safe=True,
    setup_requires=[
        'pytest-runner',
    ],
    tests_require=[
        'pytest',
    ],
    entry_points={
        'console_scripts': [
            'nutch-python = nutch.nutch:main'
        ],
    },
    package_data={
        # And include any *.conf files found in the 'conf' subdirectory
        # for the package
    },
    install_requires=[
        'setuptools',
        'requests'
    ]
)
98 | 


--------------------------------------------------------------------------------