├── .gitignore
├── LICENSE.txt
├── README.md
├── lib-selenium
    ├── build.xml
    ├── ivy.xml
    ├── plugin.xml
    └── src
    │   ├── java
    │       └── org
    │       │   └── apache
    │       │       └── nutch
    │       │           └── protocol
    │       │               └── selenium
    │       │                   └── HttpWebClient.java
    │   └── pom.xml
└── protocol-selenium
    ├── .idea
        ├── .name
        ├── compiler.xml
        ├── copyright
        │   └── profiles_settings.xml
        ├── encodings.xml
        ├── misc.xml
        ├── modules.xml
        ├── scopes
        │   └── scope_settings.xml
        ├── vcs.xml
        └── workspace.xml
    ├── build.xml
    ├── ivy.xml
    ├── plugin.xml
    └── src
        ├── java
            └── org
            │   └── apache
            │       └── nutch
            │           └── protocol
            │               └── selenium
            │                   ├── Http.java
            │                   ├── HttpResponse.java
            │                   └── package.html
        ├── pom.xml
        └── target
            └── classes
                └── org
                    └── apache
                        └── nutch
                            └── protocol
                                └── htmlunit
                                    └── package.html


/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | *.DS_Store
3 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Nutch Selenium
  2 | ==============
  3 | 
  4 | This plugin allows you to fetch JavaScript pages using Selenium, while relying on the rest of the awesome Nutch stack! This allows you to:
  5 | 
  6 | A) Leverage Nutch, a world class web crawler
  7 | 
  8 | B) Not have to use some paid service just to perform large-scale javascript/ajax aware web crawls
  9 | 
 10 | C) Not have to wait another 2 years for Nutch to patch in the [Ajax crawler hashbang workaround](https://issues.apache.org/jira/browse/NUTCH-1323), and then still have to patch it yourself to support amending the original URL with the hashbang workaround's content.
 11 | 
 12 | The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient. I also have patches to send through on nutch-htmlunit which get it working with nutch 2.2.1, so stay tuned if you want to use htmlunit for some reason.
 13 | 
 14 | 
 15 | ## IMPORTANT NOTES:
 16 | 
 17 | ~~This plugin is currently being merged into the Nutch Core - see [issue #1933 on Nutch's JIRA](https://issues.apache.org/jira/browse/NUTCH-1933)~~
 18 | 
 19 | 1. This plugin is currently in the nutch core. See [lib-selenium](https://github.com/apache/nutch/tree/master/src/plugin/lib-selenium) and [protocol-selenium](https://github.com/apache/nutch/tree/master/src/plugin/protocol-selenium).
 20 | 
 21 | 2. As a result of #1, this plugin is unsupported on github. Please see the [Nutch JIRA](https://issues.apache.org/jira/browse/NUTCH/?selectedTab=com.atlassian.jira.jira-projects-plugin:summary-panel) for issues. 
 22 | 
 23 | ## Installation (tested on Ubuntu 14.0x)
 24 | 
 25 | Part 1: Setting up Selenium
 26 | 
 27 | A) Ensure that you have Firefox installed
 28 | ```
 29 | # More info about the package: https://launchpad.net/ubuntu/trusty/+source/firefox
 30 | 
 31 | sudo apt-get install firefox
 32 | ```
 33 | B) Install Xvfb and its associates
 34 | ```
 35 | sudo apt-get install xorg synaptic xvfb gtk2-engines-pixbuf xfonts-cyrillic xfonts-100dpi \
 36 |     xfonts-75dpi xfonts-base xfonts-scalable freeglut3-dev dbus-x11 openbox x11-xserver-utils \
 37 |     libxrender1 cabextract
 38 | ```
 39 | C) Set a display for Xvfb, so that firefox believes a display is connected
 40 | ```
 41 | sudo /usr/bin/Xvfb :11 -screen 0 1024x768x24 &
 42 | export DISPLAY=:11
 43 | ```
 44 | Part 2: Installing plugin for Nutch (where NUTCH_HOME is the root of your nutch install)
 45 | 
 46 | A) Add Selenium to your Nutch dependencies
 47 | ```
 48 | <!-- NUTCH_HOME/ivy/ivy.xml -->
 49 | 
 50 | <ivy-module version="1.0">
 51 |   <dependencies>
 52 |     ...
 53 |     <!-- begin selenium dependencies -->
 54 |     <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.42.2" />
 55 | 
 56 |     <dependency org="com.opera" name="operadriver" rev="1.5">
 57 |       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
 58 |     </dependency>
 59 |     <!-- end selenium dependencies -->
 60 |   </dependencies>
 61 | </ivy-module>
 62 | ```
 63 | B) Add the required plugins to your `NUTCH_HOME/src/plugin/build.xml`
 64 | ```
 65 | <!-- NUTCH_HOME/src/plugin/build.xml -->
 66 | 
 67 | <project name="Nutch" default="deploy-core" basedir=".">
 68 |   <!-- ====================================================== -->
 69 |   <!-- Build & deploy all the plugin jars.                    -->
 70 |   <!-- ====================================================== -->
 71 |   <target name="deploy">
 72 |     ... 
 73 |     <ant dir="lib-selenium" target="deploy"/>
 74 |     <ant dir="protocol-selenium" target="deploy" />
 75 |   </target>
 76 |       ...
 77 | </project>
 78 | ```
 79 | C) Ensure that the plugin will be used as the fetcher/initial parser in your config
 80 | ```
 81 | <!-- NUTCH_HOME/conf/nutch-site.xml -->
 82 | 
 83 | <configuration>
 84 |   ...
 85 |   <property>
 86 |     <name>plugin.includes</name>
 87 |     <value>protocol-selenium|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
 88 |     <description>Regular expression naming plugin directory names to
 89 |     include.  Any plugin not matching this expression is excluded.
 90 |     In any case you need at least include the nutch-extensionpoints plugin. By
 91 |     default Nutch includes crawling just HTML and plain text via HTTP,
 92 |     and basic indexing and search plugins. In order to use HTTPS please enable 
 93 |     protocol-httpclient, but be aware of possible intermittent problems with the 
 94 |     underlying commons-httpclient library.
 95 |     </description>
 96 |   </property>
 97 | ```
 98 | D) Add the plugin folders to your installation's `NUTCH_HOME/src/plugin` directory
 99 | 
100 | ![Nutch plugin directory](http://i.imgur.com/CzLqoqO.png)
101 | 
102 | E) Compile nutch
103 | ```
104 | ant runtime
105 | ```
106 | 
107 | F) Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above)
108 | ```
109 | NUTCH_HOME/runtime/local/bin/crawl /opt/apache-nutch-2.2.1/urls/ webpage $NUTCH_SOLR_SERVER $NUTCH_CRAWL_DEPTH
110 | ```
111 | 
112 | 


--------------------------------------------------------------------------------
/lib-selenium/build.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <!--
 3 |  Licensed to the Apache Software Foundation (ASF) under one or more
 4 |  contributor license agreements.  See the NOTICE file distributed with
 5 |  this work for additional information regarding copyright ownership.
 6 |  The ASF licenses this file to You under the Apache License, Version 2.0
 7 |  (the "License"); you may not use this file except in compliance with
 8 |  the License.  You may obtain a copy of the License at
 9 | 
10 |      http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |  Unless required by applicable law or agreed to in writing, software
13 |  distributed under the License is distributed on an "AS IS" BASIS,
14 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  See the License for the specific language governing permissions and
16 |  limitations under the License.
17 | -->
18 | <project name="lib-selenium" default="jar-core">
19 | 
20 |   <import file="../build-plugin.xml"/>
21 | 
22 |   <!-- Add compilation dependencies to classpath -->
23 |   <path id="plugin.deps">    
24 |     <fileset dir="${nutch.root}/build">
25 |       <include name="**/lib-http/*.jar" />
26 |     </fileset>
27 |   </path>
28 | </project>
29 | 


--------------------------------------------------------------------------------
/lib-selenium/ivy.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | 
 3 | <!--
 4 |    Licensed to the Apache Software Foundation (ASF) under one or more
 5 |    contributor license agreements.  See the NOTICE file distributed with
 6 |    this work for additional information regarding copyright ownership.
 7 |    The ASF licenses this file to You under the Apache License, Version 2.0
 8 |    (the "License"); you may not use this file except in compliance with
 9 |    the License.  You may obtain a copy of the License at
10 | 
11 |        http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 |    Unless required by applicable law or agreed to in writing, software
14 |    distributed under the License is distributed on an "AS IS" BASIS,
15 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |    See the License for the specific language governing permissions and
17 |    limitations under the License.
18 | -->
19 | 
20 | <ivy-module version="1.0">
21 |   <info organisation="org.apache.nutch" module="${ant.project.name}">
22 |     <license name="Apache 2.0"/>
23 |     <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
24 |     <description>
25 |         Apache Nutch
26 |     </description>
27 |   </info>
28 | 
29 |   <configurations>
30 |     <include file="../../..//ivy/ivy-configurations.xml"/>
31 |   </configurations>
32 | 
33 |   <publications>
34 |     <!--get the artifact from our module name-->
35 |     <artifact conf="master"/>
36 |   </publications>
37 | 
38 |   <dependencies>
39 |   </dependencies>
40 |   
41 | </ivy-module>
42 | 


--------------------------------------------------------------------------------
/lib-selenium/plugin.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--
 3 |  Licensed to the Apache Software Foundation (ASF) under one or more
 4 |  contributor license agreements.  See the NOTICE file distributed with
 5 |  this work for additional information regarding copyright ownership.
 6 |  The ASF licenses this file to You under the Apache License, Version 2.0
 7 |  (the "License"); you may not use this file except in compliance with
 8 |  the License.  You may obtain a copy of the License at
 9 | 
10 |      http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |  Unless required by applicable law or agreed to in writing, software
13 |  distributed under the License is distributed on an "AS IS" BASIS,
14 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  See the License for the specific language governing permissions and
16 |  limitations under the License.
17 | -->
18 | <!--
19 |  ! Selenium WebDriver support library used to fetch rendered pages
20 |  !-->
21 | <plugin
22 |    id="lib-selenium"
23 |    name="HTTP Framework"
24 |    version="1.0"
25 |    provider-name="org.apache.nutch">
26 | 
27 |    <runtime>
28 |      <library name="lib-selenium.jar">
29 |         <export name="*"/>
30 |      </library>       
31 |    </runtime>
32 | 
33 |    <requires>
34 |    </requires>
35 | </plugin>
36 | 


--------------------------------------------------------------------------------
/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java:
--------------------------------------------------------------------------------
 1 | package org.apache.nutch.protocol.selenium;
 2 | 
 3 | import org.apache.hadoop.conf.Configuration;
 4 | import org.slf4j.Logger;
 5 | import org.slf4j.LoggerFactory;
 6 | 
 7 | import org.openqa.selenium.By;
 8 | import org.openqa.selenium.WebDriver;
 9 | import org.openqa.selenium.WebElement;
10 | import org.openqa.selenium.firefox.FirefoxDriver;
11 | import org.openqa.selenium.firefox.FirefoxProfile;
12 | import org.openqa.selenium.firefox.internal.ProfilesIni;
13 | import org.openqa.selenium.support.ui.ExpectedCondition;
14 | import org.openqa.selenium.support.ui.WebDriverWait;
15 | 
16 | import java.lang.String;
17 | 
18 | /** Fetches page HTML by driving Firefox through Selenium WebDriver. */
19 | public class HttpWebClient {
20 | 
21 |     private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
22 | 
23 |     /** Maximum time, in seconds, to wait for the body element after navigation. */
24 |     private static final long PAGE_LOAD_TIMEOUT_SECONDS = 3;
25 | 
26 |     /**
27 |      * Lazily creates one Firefox driver per thread, using a profile with the
28 |      * stylesheet, image and Flash-plugin preferences set below. It is not used by
29 |      * {@link #getHtmlPage(String, Configuration)}, which starts its own driver.
30 |      */
31 |     public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
32 |         @Override
33 |         protected WebDriver initialValue() {
34 |             FirefoxProfile profile = new FirefoxProfile();
35 |             // NOTE(review): 2 presumably means "block" for these permissions -- confirm.
36 |             profile.setPreference("permissions.default.stylesheet", 2);
37 |             profile.setPreference("permissions.default.image", 2);
38 |             // NOTE(review): value is the string "false", not a boolean -- confirm Firefox honours it.
39 |             profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
40 |             return new FirefoxDriver(profile);
41 |         }
42 |     };
43 | 
44 |     /**
45 |      * Loads {@code url} in a new Firefox instance and returns the inner HTML of
46 |      * its body element. The browser is shut down before this method returns.
47 |      *
48 |      * @param url  address of the page to load
49 |      * @param conf configuration; currently unused
50 |      * @return the {@code innerHTML} attribute of the page's body element
51 |      * @throws RuntimeException wrapping any failure to start the browser, load
52 |      *         the page, or find the body element within the timeout
53 |      */
54 |     public static String getHtmlPage(String url, Configuration conf) {
55 |         WebDriver driver = null;
56 | 
57 |         try {
58 |             driver = new FirefoxDriver();
59 |             driver.get(url);
60 | 
61 |             // Constructing a WebDriverWait alone waits for nothing; until() is what
62 |             // polls, here for the body element, for up to the timeout.
63 |             WebElement body = new WebDriverWait(driver, PAGE_LOAD_TIMEOUT_SECONDS)
64 |                     .until(new ExpectedCondition<WebElement>() {
65 |                         @Override
66 |                         public WebElement apply(WebDriver d) {
67 |                             return d.findElement(By.tagName("body"));
68 |                         }
69 |                     });
70 | 
71 |             return body.getAttribute("innerHTML");
72 |         } catch (Exception e) {
73 |             // Broad catch borrowed from lib-htmlunit; kept so callers still get a RuntimeException.
74 |             throw new RuntimeException(e);
75 |         } finally {
76 |             // Must not throw here: that would discard the fetched page or mask the original error.
77 |             quitQuietly(driver);
78 |         }
79 |     }
80 | 
81 |     /** Same as {@link #getHtmlPage(String, Configuration)} with a null configuration. */
82 |     public static String getHtmlPage(String url) {
83 |         return getHtmlPage(url, null);
84 |     }
85 | 
86 |     /** Quits {@code driver} if it is non-null, logging (not throwing) any failure. */
87 |     private static void quitQuietly(WebDriver driver) {
88 |         if (driver != null) {
89 |             try {
90 |                 driver.quit();
91 |             } catch (Exception e) {
92 |                 LOG.warn("Failed to quit WebDriver", e);
93 |             }
94 |         }
95 |     }
96 | }


--------------------------------------------------------------------------------
/lib-selenium/src/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>groupId</groupId>
 8 |     <artifactId>lib-selenium</artifactId>
 9 |     <version>1.0-SNAPSHOT</version>
10 | 
11 |     
12 | </project>


--------------------------------------------------------------------------------
/protocol-selenium/.idea/.name:
--------------------------------------------------------------------------------
1 | protocol-htmlunit


--------------------------------------------------------------------------------
/protocol-selenium/.idea/compiler.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="CompilerConfiguration">
 4 |     <option name="DEFAULT_COMPILER" value="Javac" />
 5 |     <resourceExtensions />
 6 |     <wildcardResourcePatterns>
 7 |       <entry name="!?*.java" />
 8 |       <entry name="!?*.form" />
 9 |       <entry name="!?*.class" />
10 |       <entry name="!?*.groovy" />
11 |       <entry name="!?*.scala" />
12 |       <entry name="!?*.flex" />
13 |       <entry name="!?*.kt" />
14 |       <entry name="!?*.clj" />
15 |     </wildcardResourcePatterns>
16 |     <annotationProcessing>
17 |       <profile default="true" name="Default" enabled="false">
18 |         <processorPath useClasspath="true" />
19 |       </profile>
20 |     </annotationProcessing>
21 |   </component>
22 | </project>
23 | 
24 | 


--------------------------------------------------------------------------------
/protocol-selenium/.idea/copyright/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="CopyrightManager">
2 |   <settings default="" />
3 | </component>


--------------------------------------------------------------------------------
/protocol-selenium/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
4 | </project>
5 | 
6 | 


--------------------------------------------------------------------------------
/protocol-selenium/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="EntryPointsManager">
 4 |     <entry_points version="2.0" />
 5 |   </component>
 6 |   <component name="IdProvider" IDEtalkID="6971D7619CBA52D864847E67A9C8B4FB" />
 7 |   <component name="IvyIDEA.ProjectSettings">
 8 |     <option name="artifactTypeSettings">
 9 |       <ArtifactTypeSettings />
10 |     </option>
11 |     <option name="propertiesSettings">
12 |       <PropertiesSettings />
13 |     </option>
14 |   </component>
15 |   <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.7" project-jdk-type="JavaSDK">
16 |     <output url="file://$PROJECT_DIR$/out" />
17 |   </component>
18 | </project>
19 | 
20 | 


--------------------------------------------------------------------------------
/protocol-selenium/.idea/modules.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ProjectModuleManager">
 4 |     <modules>
 5 |       <module fileurl="file://$PROJECT_DIR$/src/protocol-htmlunit.iml" filepath="$PROJECT_DIR$/src/protocol-htmlunit.iml" />
 6 |     </modules>
 7 |   </component>
 8 | </project>
 9 | 
10 | 


--------------------------------------------------------------------------------
/protocol-selenium/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="DependencyValidationManager">
2 |   <state>
3 |     <option name="SKIP_IMPORT_STATEMENTS" value="false" />
4 |   </state>
5 | </component>


--------------------------------------------------------------------------------
/protocol-selenium/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="" />
5 |   </component>
6 | </project>
7 | 
8 | 


--------------------------------------------------------------------------------
/protocol-selenium/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="ChangeListManager">
  4 |     <list default="true" id="2b4da0da-dce4-4b67-ac20-96dc2f31fec8" name="Default" comment="" />
  5 |     <ignored path="protocol-htmlunit.iws" />
  6 |     <ignored path=".idea/workspace.xml" />
  7 |     <option name="TRACKING_ENABLED" value="true" />
  8 |     <option name="SHOW_DIALOG" value="false" />
  9 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 10 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 11 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 12 |   </component>
 13 |   <component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
 14 |   <component name="CreatePatchCommitExecutor">
 15 |     <option name="PATCH_PATH" value="" />
 16 |   </component>
 17 |   <component name="DaemonCodeAnalyzer">
 18 |     <disable_hints />
 19 |   </component>
 20 |   <component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
 21 |   <component name="FavoritesManager">
 22 |     <favorites_list name="protocol-htmlunit" />
 23 |   </component>
 24 |   <component name="FileEditorManager">
 25 |     <leaf>
 26 |       <file leaf-file-name="Http.java" pinned="false" current="true" current-in-tab="true">
 27 |         <entry file="file://$PROJECT_DIR$/src/java/org/apache/nutch/protocol/htmlunit/Http.java">
 28 |           <provider selected="true" editor-type-id="text-editor">
 29 |             <state vertical-scroll-proportion="0.76945525" vertical-offset="19" max-vertical-offset="1050">
 30 |               <caret line="56" column="0" selection-start-line="56" selection-start-column="0" selection-end-line="56" selection-end-column="0" />
 31 |               <folding>
 32 |                 <element signature="imports" expanded="true" />
 33 |               </folding>
 34 |             </state>
 35 |           </provider>
 36 |         </entry>
 37 |       </file>
 38 |       <file leaf-file-name="HttpResponse.java" pinned="false" current="false" current-in-tab="false">
 39 |         <entry file="file://$PROJECT_DIR$/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java">
 40 |           <provider selected="true" editor-type-id="text-editor">
 41 |             <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="4635">
 42 |               <caret line="20" column="40" selection-start-line="20" selection-start-column="40" selection-end-line="20" selection-end-column="40" />
 43 |               <folding />
 44 |             </state>
 45 |           </provider>
 46 |         </entry>
 47 |       </file>
 48 |     </leaf>
 49 |   </component>
 50 |   <component name="FindManager">
 51 |     <FindUsagesManager>
 52 |       <setting name="OPEN_NEW_TAB" value="true" />
 53 |     </FindUsagesManager>
 54 |   </component>
 55 |   <component name="ProjectFrameBounds">
 56 |     <option name="x" value="275" />
 57 |     <option name="y" value="22" />
 58 |     <option name="width" value="1645" />
 59 |     <option name="height" value="1174" />
 60 |   </component>
 61 |   <component name="ProjectLevelVcsManager" settingsEditedManually="false">
 62 |     <OptionsSetting value="true" id="Add" />
 63 |     <OptionsSetting value="true" id="Remove" />
 64 |     <OptionsSetting value="true" id="Checkout" />
 65 |     <OptionsSetting value="true" id="Update" />
 66 |     <OptionsSetting value="true" id="Status" />
 67 |     <OptionsSetting value="true" id="Edit" />
 68 |     <OptionsSetting value="true" id="Undo Check Out" />
 69 |     <OptionsSetting value="true" id="Get Latest Version" />
 70 |     <ConfirmationsSetting value="0" id="Add" />
 71 |     <ConfirmationsSetting value="0" id="Remove" />
 72 |   </component>
 73 |   <component name="ProjectReloadState">
 74 |     <option name="STATE" value="0" />
 75 |   </component>
 76 |   <component name="ProjectView">
 77 |     <navigator currentView="ProjectPane" proportions="" version="1">
 78 |       <flattenPackages />
 79 |       <showMembers />
 80 |       <showModules />
 81 |       <showLibraryContents />
 82 |       <hideEmptyPackages />
 83 |       <abbreviatePackageNames />
 84 |       <autoscrollToSource />
 85 |       <autoscrollFromSource />
 86 |       <sortByType />
 87 |     </navigator>
 88 |     <panes>
 89 |       <pane id="Scope" />
 90 |       <pane id="PackagesPane" />
 91 |       <pane id="ProjectPane">
 92 |         <subPane>
 93 |           <PATH>
 94 |             <PATH_ELEMENT>
 95 |               <option name="myItemId" value="protocol-htmlunit" />
 96 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
 97 |             </PATH_ELEMENT>
 98 |             <PATH_ELEMENT>
 99 |               <option name="myItemId" value="External Libraries" />
100 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ExternalLibrariesNode" />
101 |             </PATH_ELEMENT>
102 |           </PATH>
103 |           <PATH>
104 |             <PATH_ELEMENT>
105 |               <option name="myItemId" value="protocol-htmlunit" />
106 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
107 |             </PATH_ELEMENT>
108 |           </PATH>
109 |           <PATH>
110 |             <PATH_ELEMENT>
111 |               <option name="myItemId" value="protocol-htmlunit" />
112 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
113 |             </PATH_ELEMENT>
114 |             <PATH_ELEMENT>
115 |               <option name="myItemId" value="src" />
116 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
117 |             </PATH_ELEMENT>
118 |           </PATH>
119 |           <PATH>
120 |             <PATH_ELEMENT>
121 |               <option name="myItemId" value="protocol-htmlunit" />
122 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
123 |             </PATH_ELEMENT>
124 |             <PATH_ELEMENT>
125 |               <option name="myItemId" value="src" />
126 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
127 |             </PATH_ELEMENT>
128 |             <PATH_ELEMENT>
129 |               <option name="myItemId" value="src" />
130 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
131 |             </PATH_ELEMENT>
132 |           </PATH>
133 |           <PATH>
134 |             <PATH_ELEMENT>
135 |               <option name="myItemId" value="protocol-htmlunit" />
136 |               <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
137 |             </PATH_ELEMENT>
138 |             <PATH_ELEMENT>
139 |               <option name="myItemId" value="src" />
140 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
141 |             </PATH_ELEMENT>
142 |             <PATH_ELEMENT>
143 |               <option name="myItemId" value="java" />
144 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
145 |             </PATH_ELEMENT>
146 |             <PATH_ELEMENT>
147 |               <option name="myItemId" value="htmlunit" />
148 |               <option name="myItemType" value="com.android.tools.idea.gradle.projectView.AndroidPsiDirectoryNode" />
149 |             </PATH_ELEMENT>
150 |           </PATH>
151 |         </subPane>
152 |       </pane>
153 |     </panes>
154 |   </component>
155 |   <component name="PropertiesComponent">
156 |     <property name="GoToClass.includeLibraries" value="false" />
157 |     <property name="GoToClass.toSaveIncludeLibraries" value="false" />
158 |     <property name="GoToFile.includeJavaFiles" value="false" />
159 |     <property name="MemberChooser.sorted" value="false" />
160 |     <property name="MemberChooser.showClasses" value="true" />
161 |     <property name="MemberChooser.copyJavadoc" value="false" />
162 |     <property name="FullScreen" value="false" />
163 |     <property name="WebServerToolWindowFactoryState" value="false" />
164 |     <property name="last_opened_file_path" value="$PROJECT_DIR$/../../../runtime/local/plugins/urlnormalizer-regex" />
165 |     <property name="project.structure.last.edited" value="Modules" />
166 |     <property name="project.structure.proportion" value="0.15" />
167 |     <property name="project.structure.side.proportion" value="0.2" />
168 |   </component>
169 |   <component name="RunManager">
170 |     <configuration default="true" type="#org.jetbrains.idea.devkit.run.PluginConfigurationType" factoryName="Plugin">
171 |       <module name="" />
172 |       <option name="VM_PARAMETERS" value="-Xmx512m -Xms256m -XX:MaxPermSize=250m -ea" />
173 |       <option name="PROGRAM_PARAMETERS" />
174 |       <method />
175 |     </configuration>
176 |     <configuration default="true" type="GrailsRunConfigurationType" factoryName="Grails">
177 |       <module name="" />
178 |       <setting name="vmparams" value="" />
179 |       <setting name="cmdLine" value="run-app" />
180 |       <setting name="depsClasspath" value="false" />
181 |       <setting name="passParentEnv" value="true" />
182 |       <extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
183 |       <setting name="launchBrowser" value="false" />
184 |       <method />
185 |     </configuration>
186 |     <configuration default="true" type="Remote" factoryName="Remote">
187 |       <option name="USE_SOCKET_TRANSPORT" value="true" />
188 |       <option name="SERVER_MODE" value="false" />
189 |       <option name="SHMEM_ADDRESS" value="javadebug" />
190 |       <option name="HOST" value="localhost" />
191 |       <option name="PORT" value="5005" />
192 |       <method />
193 |     </configuration>
194 |     <configuration default="true" type="JavascriptDebugType" factoryName="JavaScript Debug">
195 |       <method />
196 |     </configuration>
197 |     <configuration default="true" type="Applet" factoryName="Applet">
198 |       <module name="" />
199 |       <option name="MAIN_CLASS_NAME" />
200 |       <option name="HTML_FILE_NAME" />
201 |       <option name="HTML_USED" value="false" />
202 |       <option name="WIDTH" value="400" />
203 |       <option name="HEIGHT" value="300" />
204 |       <option name="POLICY_FILE" value="$APPLICATION_HOME_DIR$/bin/appletviewer.policy" />
205 |       <option name="VM_PARAMETERS" />
206 |       <option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
207 |       <option name="ALTERNATIVE_JRE_PATH" />
208 |       <method />
209 |     </configuration>
210 |     <configuration default="true" type="TestNG" factoryName="TestNG">
211 |       <extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
212 |       <module name="" />
213 |       <option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
214 |       <option name="ALTERNATIVE_JRE_PATH" />
215 |       <option name="SUITE_NAME" />
216 |       <option name="PACKAGE_NAME" />
217 |       <option name="MAIN_CLASS_NAME" />
218 |       <option name="METHOD_NAME" />
219 |       <option name="GROUP_NAME" />
220 |       <option name="TEST_OBJECT" value="CLASS" />
221 |       <option name="VM_PARAMETERS" value="-ea" />
222 |       <option name="PARAMETERS" />
223 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
224 |       <option name="OUTPUT_DIRECTORY" />
225 |       <option name="ANNOTATION_TYPE" />
226 |       <option name="ENV_VARIABLES" />
227 |       <option name="PASS_PARENT_ENVS" value="true" />
228 |       <option name="TEST_SEARCH_SCOPE">
229 |         <value defaultName="moduleWithDependencies" />
230 |       </option>
231 |       <option name="USE_DEFAULT_REPORTERS" value="false" />
232 |       <option name="PROPERTIES_FILE" />
233 |       <envs />
234 |       <properties />
235 |       <listeners />
236 |       <method />
237 |     </configuration>
238 |     <configuration default="true" type="JUnit" factoryName="JUnit">
239 |       <extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
240 |       <module name="" />
241 |       <option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
242 |       <option name="ALTERNATIVE_JRE_PATH" />
243 |       <option name="PACKAGE_NAME" />
244 |       <option name="MAIN_CLASS_NAME" />
245 |       <option name="METHOD_NAME" />
246 |       <option name="TEST_OBJECT" value="class" />
247 |       <option name="VM_PARAMETERS" value="-ea" />
248 |       <option name="PARAMETERS" />
249 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
250 |       <option name="ENV_VARIABLES" />
251 |       <option name="PASS_PARENT_ENVS" value="true" />
252 |       <option name="TEST_SEARCH_SCOPE">
253 |         <value defaultName="moduleWithDependencies" />
254 |       </option>
255 |       <envs />
256 |       <patterns />
257 |       <method />
258 |     </configuration>
259 |     <configuration default="true" type="FlashRunConfigurationType" factoryName="Flash App">
260 |       <option name="BCName" value="" />
261 |       <option name="IOSSimulatorSdkPath" value="" />
262 |       <option name="adlOptions" value="" />
263 |       <option name="airProgramParameters" value="" />
264 |       <option name="appDescriptorForEmulator" value="Android" />
265 |       <option name="debugTransport" value="USB" />
266 |       <option name="debuggerSdkRaw" value="BC SDK" />
267 |       <option name="emulator" value="NexusOne" />
268 |       <option name="emulatorAdlOptions" value="" />
269 |       <option name="fastPackaging" value="true" />
270 |       <option name="fullScreenHeight" value="0" />
271 |       <option name="fullScreenWidth" value="0" />
272 |       <option name="launchUrl" value="false" />
273 |       <option name="launcherParameters">
274 |         <LauncherParameters>
275 |           <option name="browser" value="a7bb68e0-33c0-4d6f-a81a-aac1fdb870c8" />
276 |           <option name="launcherType" value="OSDefault" />
277 |           <option name="newPlayerInstance" value="false" />
278 |           <option name="playerPath" value="/Applications/Flash Player Debugger.app" />
279 |         </LauncherParameters>
280 |       </option>
281 |       <option name="mobileRunTarget" value="Emulator" />
282 |       <option name="moduleName" value="" />
283 |       <option name="overriddenMainClass" value="" />
284 |       <option name="overriddenOutputFileName" value="" />
285 |       <option name="overrideMainClass" value="false" />
286 |       <option name="runTrusted" value="true" />
287 |       <option name="screenDpi" value="0" />
288 |       <option name="screenHeight" value="0" />
289 |       <option name="screenWidth" value="0" />
290 |       <option name="url" value="http://" />
291 |       <option name="usbDebugPort" value="7936" />
292 |       <method />
293 |     </configuration>
294 |     <configuration default="true" type="AndroidTestRunConfigurationType" factoryName="Android Tests">
295 |       <module name="" />
296 |       <option name="TESTING_TYPE" value="0" />
297 |       <option name="INSTRUMENTATION_RUNNER_CLASS" value="" />
298 |       <option name="METHOD_NAME" value="" />
299 |       <option name="CLASS_NAME" value="" />
300 |       <option name="PACKAGE_NAME" value="" />
301 |       <option name="TARGET_SELECTION_MODE" value="EMULATOR" />
302 |       <option name="USE_LAST_SELECTED_DEVICE" value="false" />
303 |       <option name="PREFERRED_AVD" value="" />
304 |       <option name="USE_COMMAND_LINE" value="true" />
305 |       <option name="COMMAND_LINE" value="" />
306 |       <option name="WIPE_USER_DATA" value="false" />
307 |       <option name="DISABLE_BOOT_ANIMATION" value="false" />
308 |       <option name="NETWORK_SPEED" value="full" />
309 |       <option name="NETWORK_LATENCY" value="none" />
310 |       <option name="CLEAR_LOGCAT" value="false" />
311 |       <option name="SHOW_LOGCAT_AUTOMATICALLY" value="true" />
312 |       <option name="FILTER_LOGCAT_AUTOMATICALLY" value="true" />
313 |       <method />
314 |     </configuration>
315 |     <configuration default="true" type="FlexUnitRunConfigurationType" factoryName="FlexUnit" appDescriptorForEmulator="Android" class_name="" emulatorAdlOptions="" method_name="" package_name="" scope="Class">
316 |       <option name="BCName" value="" />
317 |       <option name="launcherParameters">
318 |         <LauncherParameters>
319 |           <option name="browser" value="a7bb68e0-33c0-4d6f-a81a-aac1fdb870c8" />
320 |           <option name="launcherType" value="OSDefault" />
321 |           <option name="newPlayerInstance" value="false" />
322 |           <option name="playerPath" value="/Applications/Flash Player Debugger.app" />
323 |         </LauncherParameters>
324 |       </option>
325 |       <option name="moduleName" value="" />
326 |       <option name="trusted" value="true" />
327 |       <method />
328 |     </configuration>
329 |     <configuration default="true" type="CucumberJavaRunConfigurationType" factoryName="Cucumber java">
330 |       <extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
331 |       <option name="myFilePath" />
332 |       <option name="GLUE" />
333 |       <option name="myNameFilter" />
334 |       <option name="myGeneratedName" />
335 |       <option name="MAIN_CLASS_NAME" />
336 |       <option name="VM_PARAMETERS" />
337 |       <option name="PROGRAM_PARAMETERS" />
338 |       <option name="WORKING_DIRECTORY" />
339 |       <option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
340 |       <option name="ALTERNATIVE_JRE_PATH" />
341 |       <option name="ENABLE_SWING_INSPECTOR" value="false" />
342 |       <option name="ENV_VARIABLES" />
343 |       <option name="PASS_PARENT_ENVS" value="true" />
344 |       <module name="" />
345 |       <envs />
346 |       <method />
347 |     </configuration>
348 |     <configuration default="true" type="Application" factoryName="Application">
349 |       <extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
350 |       <option name="MAIN_CLASS_NAME" />
351 |       <option name="VM_PARAMETERS" />
352 |       <option name="PROGRAM_PARAMETERS" />
353 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
354 |       <option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
355 |       <option name="ALTERNATIVE_JRE_PATH" />
356 |       <option name="ENABLE_SWING_INSPECTOR" value="false" />
357 |       <option name="ENV_VARIABLES" />
358 |       <option name="PASS_PARENT_ENVS" value="true" />
359 |       <module name="" />
360 |       <envs />
361 |       <method />
362 |     </configuration>
363 |     <configuration default="true" type="AndroidRunConfigurationType" factoryName="Android Application">
364 |       <module name="" />
365 |       <option name="ACTIVITY_CLASS" value="" />
366 |       <option name="MODE" value="default_activity" />
367 |       <option name="DEPLOY" value="true" />
368 |       <option name="ARTIFACT_NAME" value="" />
369 |       <option name="TARGET_SELECTION_MODE" value="EMULATOR" />
370 |       <option name="USE_LAST_SELECTED_DEVICE" value="false" />
371 |       <option name="PREFERRED_AVD" value="" />
372 |       <option name="USE_COMMAND_LINE" value="true" />
373 |       <option name="COMMAND_LINE" value="" />
374 |       <option name="WIPE_USER_DATA" value="false" />
375 |       <option name="DISABLE_BOOT_ANIMATION" value="false" />
376 |       <option name="NETWORK_SPEED" value="full" />
377 |       <option name="NETWORK_LATENCY" value="none" />
378 |       <option name="CLEAR_LOGCAT" value="false" />
379 |       <option name="SHOW_LOGCAT_AUTOMATICALLY" value="true" />
380 |       <option name="FILTER_LOGCAT_AUTOMATICALLY" value="true" />
381 |       <method />
382 |     </configuration>
383 |     <list size="0" />
384 |     <configuration name="&lt;template&gt;" type="WebApp" default="true" selected="false">
385 |       <Host>localhost</Host>
386 |       <Port>5050</Port>
387 |     </configuration>
388 |   </component>
389 |   <component name="ShelveChangesManager" show_recycled="false" />
390 |   <component name="SvnConfiguration">
391 |     <configuration />
392 |   </component>
393 |   <component name="TaskManager">
394 |     <task active="true" id="Default" summary="Default task">
395 |       <changelist id="2b4da0da-dce4-4b67-ac20-96dc2f31fec8" name="Default" comment="" />
396 |       <created>1402934991106</created>
397 |       <updated>1402934991106</updated>
398 |       <workItem from="1402934997233" duration="2799000" />
399 |     </task>
400 |     <servers />
401 |   </component>
402 |   <component name="TimeTrackingManager">
403 |     <option name="totallyTimeSpent" value="2799000" />
404 |   </component>
405 |   <component name="ToolWindowManager">
406 |     <frame x="275" y="22" width="1645" height="1174" extended-state="0" />
407 |     <editor active="false" />
408 |     <layout>
409 |       <window_info id="Palette&#9;" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
410 |       <window_info id="Changes" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
411 |       <window_info id="Designer" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
412 |       <window_info id="Palette" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
413 |       <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
414 |       <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
415 |       <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
416 |       <window_info id="Java Enterprise" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
417 |       <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
418 |       <window_info id="IDEtalk Messages" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
419 |       <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
420 |       <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
421 |       <window_info id="IDEtalk" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
422 |       <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
423 |       <window_info id="IvyIDEA" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.32986766" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
424 |       <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
425 |       <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
426 |       <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="SLIDING" type="SLIDING" visible="false" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
427 |       <window_info id="Maven Projects" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
428 |       <window_info id="Application Servers" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
429 |       <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" weight="0.20336868" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
430 |       <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
431 |       <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
432 |       <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
433 |       <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
434 |       <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
435 |       <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
436 |     </layout>
437 |   </component>
438 |   <component name="Vcs.Log.UiProperties">
439 |     <option name="RECENTLY_FILTERED_USER_GROUPS">
440 |       <collection />
441 |     </option>
442 |     <option name="RECENTLY_FILTERED_BRANCH_GROUPS">
443 |       <collection />
444 |     </option>
445 |   </component>
446 |   <component name="VcsContentAnnotationSettings">
447 |     <option name="myLimit" value="2678400000" />
448 |   </component>
449 |   <component name="VcsManagerConfiguration">
450 |     <option name="myTodoPanelSettings">
451 |       <TodoPanelSettings />
452 |     </option>
453 |   </component>
454 |   <component name="XDebuggerManager">
455 |     <breakpoint-manager />
456 |   </component>
457 |   <component name="editorHistoryManager">
458 |     <entry file="file://$PROJECT_DIR$/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java">
459 |       <provider selected="true" editor-type-id="text-editor">
460 |         <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="4635">
461 |           <caret line="20" column="40" selection-start-line="20" selection-start-column="40" selection-end-line="20" selection-end-column="40" />
462 |           <folding />
463 |         </state>
464 |       </provider>
465 |     </entry>
466 |     <entry file="file://$PROJECT_DIR$/src/java/org/apache/nutch/protocol/htmlunit/Http.java">
467 |       <provider selected="true" editor-type-id="text-editor">
468 |         <state vertical-scroll-proportion="0.76945525" vertical-offset="19" max-vertical-offset="1050">
469 |           <caret line="56" column="0" selection-start-line="56" selection-start-column="0" selection-end-line="56" selection-end-column="0" />
470 |           <folding>
471 |             <element signature="imports" expanded="true" />
472 |           </folding>
473 |         </state>
474 |       </provider>
475 |     </entry>
476 |   </component>
477 |   <component name="masterDetails">
478 |     <states>
479 |       <state key="ArtifactsStructureConfigurable.UI">
480 |         <settings>
481 |           <artifact-editor />
482 |           <splitter-proportions>
483 |             <option name="proportions">
484 |               <list>
485 |                 <option value="0.2" />
486 |               </list>
487 |             </option>
488 |           </splitter-proportions>
489 |         </settings>
490 |       </state>
491 |       <state key="FacetStructureConfigurable.UI">
492 |         <settings>
493 |           <last-edited>No facets are configured</last-edited>
494 |           <splitter-proportions>
495 |             <option name="proportions">
496 |               <list>
497 |                 <option value="0.2" />
498 |               </list>
499 |             </option>
500 |           </splitter-proportions>
501 |         </settings>
502 |       </state>
503 |       <state key="GlobalLibrariesConfigurable.UI">
504 |         <settings>
505 |           <splitter-proportions>
506 |             <option name="proportions">
507 |               <list>
508 |                 <option value="0.2" />
509 |               </list>
510 |             </option>
511 |           </splitter-proportions>
512 |         </settings>
513 |       </state>
514 |       <state key="JdkListConfigurable.UI">
515 |         <settings>
516 |           <last-edited>1.7</last-edited>
517 |           <splitter-proportions>
518 |             <option name="proportions">
519 |               <list>
520 |                 <option value="0.2" />
521 |               </list>
522 |             </option>
523 |           </splitter-proportions>
524 |         </settings>
525 |       </state>
526 |       <state key="ModuleStructureConfigurable.UI">
527 |         <settings>
528 |           <last-edited>protocol-htmlunit</last-edited>
529 |           <splitter-proportions>
530 |             <option name="proportions">
531 |               <list>
532 |                 <option value="0.2" />
533 |               </list>
534 |             </option>
535 |           </splitter-proportions>
536 |         </settings>
537 |       </state>
538 |       <state key="ProjectJDKs.UI">
539 |         <settings>
540 |           <last-edited>1.7</last-edited>
541 |           <splitter-proportions>
542 |             <option name="proportions">
543 |               <list>
544 |                 <option value="0.2" />
545 |               </list>
546 |             </option>
547 |           </splitter-proportions>
548 |         </settings>
549 |       </state>
550 |       <state key="ProjectLibrariesConfigurable.UI">
551 |         <settings>
552 |           <splitter-proportions>
553 |             <option name="proportions">
554 |               <list>
555 |                 <option value="0.2" />
556 |               </list>
557 |             </option>
558 |           </splitter-proportions>
559 |         </settings>
560 |       </state>
561 |     </states>
562 |   </component>
563 | </project>
564 | 
565 | 


--------------------------------------------------------------------------------
/protocol-selenium/build.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <!--
 3 |  Licensed to the Apache Software Foundation (ASF) under one or more
 4 |  contributor license agreements.  See the NOTICE file distributed with
 5 |  this work for additional information regarding copyright ownership.
 6 |  The ASF licenses this file to You under the Apache License, Version 2.0
 7 |  (the "License"); you may not use this file except in compliance with
 8 |  the License.  You may obtain a copy of the License at
 9 | 
10 |      http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |  Unless required by applicable law or agreed to in writing, software
13 |  distributed under the License is distributed on an "AS IS" BASIS,
14 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  See the License for the specific language governing permissions and
16 |  limitations under the License.
17 | -->
18 | <project name="protocol-selenium" default="jar-core">
19 | 
20 |   <import file="../build-plugin.xml"/>
21 | 
22 |   <!-- Build compilation dependencies: runs the "jar" target of ../lib-http and ../lib-selenium -->
23 |   <target name="deps-jar">
24 |     <ant target="jar" inheritall="false" dir="../lib-http"/>
25 |     <ant target="jar" inheritall="false" dir="../lib-selenium"/>
26 |   </target>
27 | 
28 |   <!-- Add compilation dependencies to classpath: lib-http and lib-selenium jars found under ${nutch.root}/build -->
29 |   <path id="plugin.deps">
30 |     <fileset dir="${nutch.root}/build">
31 |       <include name="**/lib-http/*.jar" />
32 |       <include name="**/lib-selenium/*.jar" />
33 |     </fileset>
34 |   </path>
35 | 
36 | </project>
37 | 


--------------------------------------------------------------------------------
/protocol-selenium/ivy.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | 
 3 | <!--
 4 |    Licensed to the Apache Software Foundation (ASF) under one or more
 5 |    contributor license agreements.  See the NOTICE file distributed with
 6 |    this work for additional information regarding copyright ownership.
 7 |    The ASF licenses this file to You under the Apache License, Version 2.0
 8 |    (the "License"); you may not use this file except in compliance with
 9 |    the License.  You may obtain a copy of the License at
10 | 
11 |        http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 |    Unless required by applicable law or agreed to in writing, software
14 |    distributed under the License is distributed on an "AS IS" BASIS,
15 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |    See the License for the specific language governing permissions and
17 |    limitations under the License.
18 | -->
19 | 
20 | <ivy-module version="1.0">
21 |   <info organisation="org.apache.nutch" module="${ant.project.name}">
22 |     <license name="Apache 2.0"/>
23 |     <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
24 |     <description>
25 |         Apache Nutch
26 |     </description>
27 |   </info>
28 |   <!-- Configurations are not declared here; they are included from ivy/ivy-configurations.xml three directories up -->
29 |   <configurations>
30 |     <include file="../../..//ivy/ivy-configurations.xml"/>
31 |   </configurations>
32 | 
33 |   <publications>
34 |     <!-- Get the artifact from our module name (no explicit name is given) -->
35 |     <artifact conf="default"/>
36 |   </publications>
37 |   <!-- No Ivy-managed dependencies are declared for this module -->
38 |   <dependencies>
39 |   </dependencies>
40 |   
41 | </ivy-module>
42 | 


--------------------------------------------------------------------------------
/protocol-selenium/plugin.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--
 3 |  Licensed to the Apache Software Foundation (ASF) under one or more
 4 |  contributor license agreements.  See the NOTICE file distributed with
 5 |  this work for additional information regarding copyright ownership.
 6 |  The ASF licenses this file to You under the Apache License, Version 2.0
 7 |  (the "License"); you may not use this file except in compliance with
 8 |  the License.  You may obtain a copy of the License at
 9 | 
10 |      http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |  Unless required by applicable law or agreed to in writing, software
13 |  distributed under the License is distributed on an "AS IS" BASIS,
14 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  See the License for the specific language governing permissions and
16 |  limitations under the License.
17 | -->
18 | <plugin
19 |    id="protocol-selenium"
20 |    name="Http Protocol Plug-in"
21 |    version="1.0.0"
22 |    provider-name="nutch.org">
23 |    <!-- Runtime library of this plugin: protocol-selenium.jar, exporting every name -->
24 |    <runtime>
25 |       <library name="protocol-selenium.jar">
26 |          <export name="*"/>
27 |       </library>
28 |    </runtime>
29 |    <!-- Plugins this one imports: the extension points plus lib-http and lib-selenium -->
30 |    <requires>
31 |       <import plugin="nutch-extensionpoints"/>
32 |       <import plugin="lib-http"/>
33 |       <import plugin="lib-selenium"/>
34 |    </requires>
35 |    <!-- Registers org.apache.nutch.protocol.selenium.Http at the org.apache.nutch.protocol.Protocol extension point -->
36 |    <extension id="org.apache.nutch.protocol.selenium"
37 |               name="HttpProtocol"
38 |               point="org.apache.nutch.protocol.Protocol">
39 |       <!-- NOTE(review): protocolName=http presumably binds this implementation to http URLs; confirm against the plugin loader -->
40 |       <implementation id="org.apache.nutch.protocol.selenium.Http"
41 |                       class="org.apache.nutch.protocol.selenium.Http">
42 |         <parameter name="protocolName" value="http"/>
43 |       </implementation>
44 | 
45 |    </extension>
46 | 
47 | </plugin>
48 | 


--------------------------------------------------------------------------------
/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java:
--------------------------------------------------------------------------------
 1 | package org.apache.nutch.protocol.selenium;
 2 | 
 3 | // JDK imports
 4 | import java.io.IOException;
 5 | import java.net.URL;
 6 | import java.util.Collection;
 7 | import java.util.HashSet;
 8 | 
 9 | import org.apache.hadoop.conf.Configuration;
10 | import org.apache.nutch.net.protocols.Response;
11 | import org.apache.nutch.protocol.http.api.HttpBase;
12 | import org.apache.nutch.protocol.ProtocolException;
13 | import org.apache.nutch.util.NutchConfiguration;
14 | import org.apache.nutch.storage.WebPage;
15 | import org.apache.nutch.storage.WebPage.Field;
16 | 
17 | import org.apache.nutch.protocol.selenium.HttpResponse;
18 | 
19 | import org.slf4j.Logger;
20 | import org.slf4j.LoggerFactory;
21 | 
22 | /**
23 |  * Selenium flavour of the HTTP protocol plugin.
24 |  *
25 |  * <p>Everything except response creation is inherited from {@link HttpBase};
26 |  * each fetch is delegated to a new {@link HttpResponse}.</p>
27 |  */
28 | public class Http extends HttpBase {
29 | 
30 |   public static final Logger LOG = LoggerFactory.getLogger(Http.class);
31 | 
32 |   /** Fields returned by {@link #getFields()}: MODIFIED_TIME and HEADERS. */
33 |   private static final Collection<Field> FIELDS = new HashSet<Field>();
34 | 
35 |   static {
36 |     FIELDS.add(Field.MODIFIED_TIME);
37 |     FIELDS.add(Field.HEADERS);
38 |   }
39 | 
40 |   /** Creates the plugin and passes this class's logger to the {@link HttpBase} constructor. */
41 |   public Http() {
42 |     super(LOG);
43 |   }
44 | 
45 |   /** Command-line entry point: configures a plugin instance and delegates to the two-argument {@code main}. */
46 |   public static void main(String[] args) throws Exception {
47 |     Http plugin = new Http();
48 |     plugin.setConf(NutchConfiguration.create());
49 |     main(plugin, args);
50 |   }
51 | 
52 |   /** Passes the configuration straight through to the superclass. */
53 |   @Override
54 |   public void setConf(Configuration conf) {
55 |     super.setConf(conf);
56 |   }
57 | 
58 |   /**
59 |    * Builds the response for {@code url} by constructing an {@link HttpResponse}.
60 |    * The {@code redirect} flag is not used by this implementation.
61 |    */
62 |   @Override
63 |   protected Response getResponse(URL url, WebPage page, boolean redirect)
64 |       throws ProtocolException, IOException {
65 |     return new HttpResponse(this, url, page, getConf());
66 |   }
67 | 
68 |   /** Returns the {@link WebPage} fields registered in the static initializer. */
69 |   @Override
70 |   public Collection<WebPage.Field> getFields() {
71 |     return FIELDS;
72 |   }
73 | }
74 | 


--------------------------------------------------------------------------------
/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java:
--------------------------------------------------------------------------------
  1 | package org.apache.nutch.protocol.selenium;
  2 | 
  3 | // JDK imports
  4 | import java.io.BufferedInputStream;
  5 | import java.io.EOFException;
  6 | import java.io.IOException;
  7 | import java.io.OutputStream;
  8 | import java.io.PushbackInputStream;
  9 | import java.net.InetSocketAddress;
 10 | import java.net.Socket;
 11 | import java.net.URL;
 12 | 
 13 | import org.apache.commons.lang.StringUtils;
 14 | import org.apache.hadoop.conf.Configuration;
 15 | // import org.apache.nutch.crawl.CrawlDatum;
 16 | import org.apache.nutch.storage.WebPage;
 17 | import org.apache.nutch.metadata.Metadata;
 18 | import org.apache.nutch.metadata.SpellCheckedMetadata;
 19 | import org.apache.nutch.net.protocols.HttpDateFormat;
 20 | import org.apache.nutch.net.protocols.Response;
 21 | import org.apache.nutch.protocol.ProtocolException;
 22 | import org.apache.nutch.protocol.http.api.HttpBase;
 23 | import org.apache.nutch.protocol.http.api.HttpException;
 24 | 
 25 | import org.openqa.selenium.By;
 26 | import org.openqa.selenium.WebDriver;
 27 | import org.openqa.selenium.WebElement;
 28 | import org.openqa.selenium.firefox.FirefoxDriver;
 29 | import org.openqa.selenium.support.ui.ExpectedCondition;
 30 | import org.openqa.selenium.support.ui.WebDriverWait;
 31 | 
 32 | /* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient. */
 33 | 
 34 | /**
 35 |  * Response for a single fetch made by the Selenium protocol plugin.
 36 |  *
 37 |  * <p>The constructor sends a plain HTTP/1.0 GET over a raw socket to read the
 38 |  * status code and headers, then asks {@link HttpWebClient#getHtmlPage} for the
 39 |  * page and stores its UTF-8 bytes as the content. The body returned on the
 40 |  * socket is never read.</p>
 41 |  */
 42 | public class HttpResponse implements Response {
 43 | 
 44 |     private Http http;
 45 |     private URL url;
 46 |     private byte[] content;
 47 |     private int code;
 48 |     private Metadata headers = new SpellCheckedMetadata();
 49 | 
 50 |     /** The nutch configuration */
 51 |     private Configuration conf = null;
 52 | 
 53 |     /**
 54 |      * Fetches {@code url} and fills in the status code, headers and content.
 55 |      *
 56 |      * @param http plugin supplying timeout, proxy, user-agent and accept settings
 57 |      * @param url  address to fetch; must use the {@code http} scheme
 58 |      * @param page stored page; a positive modified time adds an If-Modified-Since header
 59 |      * @param conf configuration handed to {@link HttpWebClient#getHtmlPage}
 60 |      * @throws ProtocolException when the URL is not http or the response head cannot be
 61 |      *                           parsed (raised as {@link HttpException})
 62 |      * @throws IOException       on socket errors or when the response ends prematurely
 63 |      */
 64 |     public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws ProtocolException, IOException {
 65 | 
 66 |         this.conf = conf;
 67 |         this.http = http;
 68 |         this.url = url;
 69 | 
 70 |         if (!"http".equals(url.getProtocol()))
 71 |             throw new HttpException("Not an HTTP url:" + url);
 72 | 
 73 |         if (Http.LOG.isTraceEnabled()) {
 74 |             Http.LOG.trace("fetching " + url);
 75 |         }
 76 | 
 77 |         String path = "".equals(url.getFile()) ? "/" : url.getFile();
 78 | 
 79 |         // some servers will redirect a request with a host line like
 80 |         // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
 81 |         // don't want the :80...
 82 | 
 83 |         String host = url.getHost();
 84 |         int port;
 85 |         String portString;
 86 |         if (url.getPort() == -1) {
 87 |             port = 80;
 88 |             portString = "";
 89 |         } else {
 90 |             port = url.getPort();
 91 |             portString = ":" + port;
 92 |         }
 93 |         Socket socket = null;
 94 | 
 95 |         try {
 96 |             socket = new Socket(); // create the socket
 97 |             socket.setSoTimeout(http.getTimeout());
 98 | 
 99 |             // connect, to the proxy instead of the origin host when one is configured
100 |             String sockHost = http.useProxy() ? http.getProxyHost() : host;
101 |             int sockPort = http.useProxy() ? http.getProxyPort() : port;
102 |             InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
103 |             socket.connect(sockAddr, http.getTimeout());
104 | 
105 |             // make request; encode with a fixed charset rather than the platform default
106 |             OutputStream req = socket.getOutputStream();
107 |             req.write(buildRequest(host, portString, path, page).getBytes("UTF-8"));
108 |             req.flush();
109 | 
110 |             PushbackInputStream in = // process response
111 |             new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
112 |                     Http.BUFFER_SIZE);
113 | 
114 |             StringBuilder line = new StringBuilder();
115 | 
116 |             // keep reading status line + headers until the status is not 100 ("Continue")
117 |             do {
118 |                 this.code = parseStatusLine(in, line);
119 |                 parseHeaders(in, line);
120 |             } while (code == 100);
121 | 
122 |             // the content comes from HttpWebClient, not from the socket
123 |             readPlainContent(url);
124 | 
125 |         } finally {
126 |             if (socket != null)
127 |                 socket.close();
128 |         }
129 | 
130 |     }
131 | 
132 |     /**
133 |      * Assembles the request head (request line, headers and terminating blank line).
134 |      * Logs an error and omits the User-Agent header when no user agent is configured.
135 |      */
136 |     private String buildRequest(String host, String portString, String path, WebPage page) {
137 |         StringBuilder reqStr = new StringBuilder("GET ");
138 |         if (http.useProxy()) {
139 |             // through a proxy the request line carries the absolute URL
140 |             reqStr.append(url.getProtocol()).append("://").append(host).append(portString).append(path);
141 |         } else {
142 |             reqStr.append(path);
143 |         }
144 | 
145 |         reqStr.append(" HTTP/1.0\r\n");
146 | 
147 |         reqStr.append("Host: ").append(host).append(portString).append("\r\n");
148 | 
149 |         reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
150 | 
151 |         String userAgent = http.getUserAgent();
152 |         if ((userAgent == null) || (userAgent.length() == 0)) {
153 |             if (Http.LOG.isErrorEnabled()) {
154 |                 Http.LOG.error("User-agent is not set!");
155 |             }
156 |         } else {
157 |             reqStr.append("User-Agent: ").append(userAgent).append("\r\n");
158 |         }
159 | 
160 |         reqStr.append("Accept-Language: ").append(http.getAcceptLanguage()).append("\r\n");
161 | 
162 |         reqStr.append("Accept: ").append(http.getAccept()).append("\r\n");
163 | 
164 |         if (page.getModifiedTime() > 0) {
165 |             reqStr.append("If-Modified-Since: ").append(HttpDateFormat.toString(page.getModifiedTime()));
166 |             reqStr.append("\r\n");
167 |         }
168 |         reqStr.append("\r\n");
169 | 
170 |         return reqStr.toString();
171 |     }
172 | 
173 |     /* ------------------------- *
174 |      * <implementation:Response> *
175 |      * ------------------------- */
176 | 
177 |     public URL getUrl() {
178 |         return url;
179 |     }
180 | 
181 |     public int getCode() {
182 |         return code;
183 |     }
184 | 
185 |     public String getHeader(String name) {
186 |         return headers.get(name);
187 |     }
188 | 
189 |     public Metadata getHeaders() {
190 |         return headers;
191 |     }
192 | 
193 |     public byte[] getContent() {
194 |         return content;
195 |     }
196 | 
197 |     /* -------------------------- *
198 |      * </implementation:Response> *
199 |      * -------------------------- */
200 | 
201 |     /** Loads the page through {@link HttpWebClient#getHtmlPage} and stores it as UTF-8 bytes. */
202 |     private void readPlainContent(URL url) throws IOException {
203 |         String page = HttpWebClient.getHtmlPage(url.toString(), conf);
204 | 
205 |         content = page.getBytes("UTF-8");
206 |     }
207 | 
208 |     /**
209 |      * Reads one line and returns the integer between its first and second space
210 |      * (or the end of the line when there is no reason phrase).
211 |      *
212 |      * @throws HttpException if that token is not a number
213 |      */
214 |     private int parseStatusLine(PushbackInputStream in, StringBuilder line) throws IOException, HttpException {
215 |         readLine(in, line, false);
216 | 
217 |         int codeStart = line.indexOf(" ");
218 |         int codeEnd = line.indexOf(" ", codeStart + 1);
219 | 
220 |         // handle lines with no plaintext result code, ie:
221 |         // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
222 |         if (codeEnd == -1)
223 |             codeEnd = line.length();
224 | 
225 |         int code;
226 |         try {
227 |             code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
228 |         } catch (NumberFormatException e) {
229 |             throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
230 |         }
231 | 
232 |         return code;
233 |     }
234 | 
235 |     /**
236 |      * Splits {@code line} at the first colon and stores key/value in {@link #headers}.
237 |      * A whitespace-only line is ignored.
238 |      *
239 |      * @throws HttpException if a non-blank line contains no colon
240 |      */
241 |     private void processHeaderLine(StringBuilder line) throws IOException, HttpException {
242 | 
243 |         int colonIndex = line.indexOf(":"); // key is up to colon
244 |         if (colonIndex == -1) {
245 |             int i;
246 |             for (i = 0; i < line.length(); i++)
247 |                 if (!Character.isWhitespace(line.charAt(i)))
248 |                     break;
249 |             if (i == line.length())
250 |                 return;
251 |             throw new HttpException("No colon in header:" + line);
252 |         }
253 |         String key = line.substring(0, colonIndex);
254 | 
255 |         int valueStart = colonIndex + 1; // skip whitespace
256 |         while (valueStart < line.length()) {
257 |             int c = line.charAt(valueStart);
258 |             if (c != ' ' && c != '\t')
259 |                 break;
260 |             valueStart++;
261 |         }
262 |         String value = line.substring(valueStart);
263 |         headers.set(key, value);
264 |     }
265 | 
266 |     // Adds headers to our headers Metadata
267 |     private void parseHeaders(PushbackInputStream in, StringBuilder line) throws IOException, HttpException {
268 | 
269 |         while (readLine(in, line, true) != 0) {
270 | 
271 |             // handle HTTP responses with missing blank line after headers
272 |             int pos;
273 |             if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
274 |                     || ((pos = line.indexOf("<html")) != -1)) {
275 | 
276 |                 in.unread(line.substring(pos).getBytes("UTF-8"));
277 |                 line.setLength(pos);
278 | 
279 |                 try {
280 |                     //TODO: (CM) We don't know the header names here
281 |                     //since we're just handling them generically. It would
282 |                     //be nice to provide some sort of mapping function here
283 |                     //for the returned header names to the standard metadata
284 |                     //names in the ParseData class
285 |                     processHeaderLine(line);
286 |                 } catch (Exception e) {
287 |                     // fixme:
288 |                     Http.LOG.warn("Error: ", e);
289 |                 }
290 |                 return;
291 |             }
292 | 
293 |             processHeaderLine(line);
294 |         }
295 |     }
296 | 
297 |     /**
298 |      * Reads up to the next CR, LF or CRLF into {@code line} (terminator excluded).
299 |      * With {@code allowContinuedLine}, a non-empty line followed by a space or tab
300 |      * is treated as continued and reading goes on into the same buffer.
301 |      *
302 |      * @return length of the line read
303 |      * @throws EOFException if the stream ends before a line terminator
304 |      */
305 |     private static int readLine(PushbackInputStream in, StringBuilder line, boolean allowContinuedLine)
306 |             throws IOException {
307 |         line.setLength(0);
308 |         for (int c = in.read(); c != -1; c = in.read()) {
309 |             switch (c) {
310 |             case '\r':
311 |                 if (peek(in) == '\n') {
312 |                     in.read();
313 |                 }
314 |                 // fall through: a bare CR ends the line just like LF
315 |             case '\n':
316 |                 if (line.length() > 0) {
317 |                     // at EOL -- check for continued line if the current
318 |                     // (possibly continued) line wasn't blank
319 |                     if (allowContinuedLine)
320 |                         switch (peek(in)) {
321 |                         case ' ':
322 |                         case '\t': // line is continued
323 |                             in.read();
324 |                             continue;
325 |                         }
326 |                 }
327 |                 return line.length(); // else complete
328 |             default:
329 |                 line.append((char) c);
330 |             }
331 |         }
332 |         throw new EOFException();
333 |     }
334 | 
335 |     /**
336 |      * Returns the next byte without consuming it, or -1 at end of stream.
337 |      * Nothing is pushed back at end of stream: {@code unread(-1)} would insert a
338 |      * 0xFF byte and hide the EOF from the next read.
339 |      */
340 |     private static int peek(PushbackInputStream in) throws IOException {
341 |         int value = in.read();
342 |         if (value != -1) {
343 |             in.unread(value);
344 |         }
345 |         return value;
346 |     }
347 | }
348 | 


--------------------------------------------------------------------------------
/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <body>
3 | <p>Protocol plugin which supports retrieving documents via Selenium.</p><p></p>
4 | </body>
5 | </html>
6 | 


--------------------------------------------------------------------------------
/protocol-selenium/src/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 |     <!-- NOTE(review): groupId is the literal string "groupId", which looks like a placeholder; this module's ivy.xml uses org.apache.nutch. Confirm the intended value. -->
 7 |     <groupId>groupId</groupId>
 8 |     <artifactId>protocol-selenium</artifactId>
 9 |     <version>1.0-SNAPSHOT</version>
10 |     <!-- No dependencies or build settings are declared in this POM -->
11 |     
12 | </project>


--------------------------------------------------------------------------------
/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <body>
3 | <p>Protocol plugin which supports retrieving documents via HtmlUnit.</p><p></p>
4 | </body>
5 | </html>
6 | 


--------------------------------------------------------------------------------