*
190 | * ------------------------- */
191 |
192 | private void readPlainContent(URL url) throws IOException {
193 | String page = HttpWebClient.getHtmlPage(url.toString(), conf);
194 |
195 | content = page.getBytes("UTF-8");
196 | }
197 |
198 | private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
199 | readLine(in, line, false);
200 |
201 | int codeStart = line.indexOf(" ");
202 | int codeEnd = line.indexOf(" ", codeStart + 1);
203 |
204 | // handle lines with no plaintext result code, ie:
205 | // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
206 | if (codeEnd == -1)
207 | codeEnd = line.length();
208 |
209 | int code;
210 | try {
211 | code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
212 | } catch (NumberFormatException e) {
213 | throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
214 | }
215 |
216 | return code;
217 | }
218 |
219 | private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
220 |
221 | int colonIndex = line.indexOf(":"); // key is up to colon
222 | if (colonIndex == -1) {
223 | int i;
224 | for (i = 0; i < line.length(); i++)
225 | if (!Character.isWhitespace(line.charAt(i)))
226 | break;
227 | if (i == line.length())
228 | return;
229 | throw new HttpException("No colon in header:" + line);
230 | }
231 | String key = line.substring(0, colonIndex);
232 |
233 | int valueStart = colonIndex + 1; // skip whitespace
234 | while (valueStart < line.length()) {
235 | int c = line.charAt(valueStart);
236 | if (c != ' ' && c != '\t')
237 | break;
238 | valueStart++;
239 | }
240 | String value = line.substring(valueStart);
241 | headers.set(key, value);
242 | }
243 |
244 | // Adds headers to our headers Metadata
245 | private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
246 |
247 | while (readLine(in, line, true) != 0) {
248 |
249 | // handle HTTP responses with missing blank line after headers
250 | int pos;
251 | if (((pos = line.indexOf(" 0) {
286 | // at EOL -- check for continued line if the current
287 | // (possibly continued) line wasn't blank
288 | if (allowContinuedLine)
289 | switch (peek(in)) {
290 | case ' ':
291 | case '\t': // line is continued
292 | in.read();
293 | continue;
294 | }
295 | }
296 | return line.length(); // else complete
297 | default:
298 | line.append((char) c);
299 | }
300 | }
301 | throw new EOFException();
302 | }
303 |
304 | private static int peek(PushbackInputStream in) throws IOException {
305 | int value = in.read();
306 | in.unread(value);
307 | return value;
308 | }
309 | }
310 |
--------------------------------------------------------------------------------
/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Protocol plugin that supports retrieving documents via Selenium.
4 |
5 |
6 |
--------------------------------------------------------------------------------
/protocol-selenium/src/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | groupId
8 | protocol-selenium
9 | 1.0-SNAPSHOT
10 |
11 |
12 |
--------------------------------------------------------------------------------
/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Protocol plugin that supports retrieving documents via HtmlUnit.
4 |
5 |
6 |
--------------------------------------------------------------------------------