This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.
70 | *
71 | *
Examples
72 | *
There are numerous examples on the website. Please check them out to get more context on how scraping works.
73 | *
74 | *
Example 1
75 | *
Here would be an example.
76 | *
77 | *
Example 2
78 | *
Here would be the second example.
79 | *
80 | *
Example 3
81 | *
Here would be another example.
82 | *
83 | *
84 | *
85 | */
86 | $web->go('https://test-pages.phpscraper.de/content/outline.html');
87 |
88 | // Get the content outline
89 | $this->assertSame(
90 | [
91 | [
92 | 'tag' => 'h1',
93 | 'content' => 'We are testing here!',
94 | ], [
95 | 'tag' => 'p',
96 | 'content' => 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
97 | ], [
98 | 'tag' => 'h2',
99 | 'content' => 'Examples',
100 | ], [
101 | 'tag' => 'p',
102 | 'content' => 'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
103 | ], [
104 | 'tag' => 'h3',
105 | 'content' => 'Example 1',
106 | ], [
107 | 'tag' => 'p',
108 | 'content' => 'Here would be an example.',
109 | ], [
110 | 'tag' => 'h3',
111 | 'content' => 'Example 2',
112 | ], [
113 | 'tag' => 'p',
114 | 'content' => 'Here would be the second example.',
115 | ], [
116 | 'tag' => 'h3',
117 | 'content' => 'Example 3',
118 | ], [
119 | 'tag' => 'p',
120 | 'content' => 'Here would be another example.',
121 | ], [
122 | 'tag' => 'p',
123 | 'content' => '',
124 | ],
125 | ],
126 | $web->outlineWithParagraphs
127 | );
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/tests/ParagraphsTest.php:
--------------------------------------------------------------------------------
1 | We are testing here!
18 | *
This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.
19 | *
20 | *
Examples
21 | *
There are numerous examples on the website. Please check them out to get more context on how scraping works.
22 | *
23 | *
Example 1
24 | *
Here would be an example.
25 | *
26 | *
Example 2
27 | *
Here would be the second example.
28 | *
29 | *
Example 3
30 | *
Here would be another example.
31 | *
32 | *
33 | *
34 | */
35 | $web->go('https://test-pages.phpscraper.de/content/outline.html');
36 |
37 | // Get the paragraphs
38 | $this->assertSame([
39 | 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
40 | 'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
41 | 'Here would be an example.',
42 | 'Here would be the second example.',
43 | 'Here would be another example.',
44 | '',
45 | ], $web->paragraphs);
46 | }
47 |
48 | /**
49 | * @test
50 | */
51 | public function cleanParagraphTest()
52 | {
53 | $web = new \Spekulatius\PHPScraper\PHPScraper;
54 |
55 | /**
56 | * Navigate to the test page. This page contains:
57 | *
58 | *
We are testing here!
59 | *
This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.
60 | *
61 | *
Examples
62 | *
There are numerous examples on the website. Please check them out to get more context on how scraping works.
63 | *
64 | *
Example 1
65 | *
Here would be an example.
66 | *
67 | *
Example 2
68 | *
Here would be the second example.
69 | *
70 | *
Example 3
71 | *
Here would be another example.
72 | *
73 | *
74 | *
75 | */
76 | $web->go('https://test-pages.phpscraper.de/content/outline.html');
77 |
78 | // Get the cleaned up paragraphs
79 | $this->assertSame([
80 | 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
81 | 'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
82 | 'Here would be an example.',
83 | 'Here would be the second example.',
84 | 'Here would be another example.',
85 | ], $web->cleanParagraphs);
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/tests/ParserCsvTest.php:
--------------------------------------------------------------------------------
1 | parseCsv();
19 | } catch (\Exception $e) {
20 | // Did we get the expected exception?
21 | $this->assertSame(
22 | 'You can not call parseCsv() without parameter or initial navigation.',
23 | $e->getMessage()
24 | );
25 | }
26 |
27 | // This tests ensures an exception is thrown, if no context is given.
28 | // Context means either it's been navigated before (URL context) or get something to (fetch +) parse
29 | try {
30 | $web = new \Spekulatius\PHPScraper\PHPScraper;
31 | $web->parseCsvWithHeader();
32 | } catch (\Exception $e) {
33 | // Did we get the expected exception?
34 | $this->assertSame(
35 | 'You can not call parseCsvWithHeader() without parameter or initial navigation.',
36 | $e->getMessage()
37 | );
38 | }
39 | }
40 |
41 | /**
42 | * @test
43 | */
44 | public function testCsvDecodeRaw()
45 | {
46 | $web = new \Spekulatius\PHPScraper\PHPScraper;
47 |
48 | // Only decoding
49 | $this->assertSame(
50 | [
51 | ['date', 'value'],
52 | ['1945-02-06', '4.20'],
53 | ['1952-03-11', '42'],
54 | ],
55 | $web->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"),
56 | );
57 |
58 | // Fetching and decoding
59 | $this->assertSame(
60 | [
61 | ['date', 'value'],
62 | ['1945-02-06', '4.20'],
63 | ['1952-03-11', '42'],
64 | ],
65 | $web->csvDecodeRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')),
66 | );
67 | }
68 |
69 | /**
70 | * @test
71 | */
72 | public function testCsvDecode()
73 | {
74 | $web = new \Spekulatius\PHPScraper\PHPScraper;
75 |
76 | // Only decoding
77 | $this->assertSame(
78 | [
79 | ['date', 'value'],
80 | ['1945-02-06', 4.20],
81 | ['1952-03-11', 42],
82 | ],
83 | $web->csvDecode("date,value\n1945-02-06,4.20\n1952-03-11,42"),
84 | );
85 |
86 | // Fetching and decoding
87 | $this->assertSame(
88 | [
89 | ['date', 'value'],
90 | ['1945-02-06', 4.20],
91 | ['1952-03-11', 42],
92 | ],
93 | $web->csvDecode($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')),
94 | );
95 | }
96 |
97 | /**
98 | * Test with pipe as separator, enclosure and escape.
99 | *
100 | * @test
101 | */
102 | public function testCsvDecodeAndCustomEncoding()
103 | {
104 | $web = new \Spekulatius\PHPScraper\PHPScraper;
105 |
106 | $this->assertSame(
107 | [
108 | ['date', 'value'],
109 | ['1945-02-06', 4.20],
110 | ['1952-03-11', 42],
111 | ['\\'],
112 | ],
113 | $web->csvDecode(
114 | "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\",
115 | '|',
116 | '"',
117 | '\\'
118 | )
119 | );
120 | }
121 |
122 | /**
123 | * @test
124 | */
125 | public function testCsvDecodeWithHeaderRaw()
126 | {
127 | $web = new \Spekulatius\PHPScraper\PHPScraper;
128 |
129 | // Only decoding
130 | $this->assertSame(
131 | [
132 | ['date' => '1945-02-06', 'value' => '4.20'],
133 | ['date' => '1952-03-11', 'value' => '42'],
134 | ],
135 | $web->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"),
136 | );
137 |
138 | // Fetching and decoding
139 | $this->assertSame(
140 | [
141 | ['date' => '1945-02-06', 'value' => '4.20'],
142 | ['date' => '1952-03-11', 'value' => '42'],
143 | ],
144 | $web->csvDecodeWithHeaderRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')),
145 | );
146 | }
147 |
148 | /**
149 | * @test
150 | */
151 | public function testCsvDecodeWithHeaderAndCasting()
152 | {
153 | $web = new \Spekulatius\PHPScraper\PHPScraper;
154 |
155 | $this->assertSame(
156 | [
157 | ['date' => '1945-02-06', 'value' => 4.20],
158 | ['date' => '1952-03-11', 'value' => 42],
159 | ],
160 | $web->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42"),
161 | );
162 | }
163 |
164 | /**
165 | * Test with header, pipe as separator, and enclosure.
166 | *
167 | * @test
168 | */
169 | public function testCsvDecodeWithHeaderAndCustomEncoding()
170 | {
171 | $web = new \Spekulatius\PHPScraper\PHPScraper;
172 |
173 | $this->assertSame(
174 | [
175 | ['date' => '1945-02-06', 'value' => 4.20],
176 | ['date' => '1952-03-11', 'value' => 42],
177 | ],
178 |
179 | $web->csvDecodeWithHeader(
180 | "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"",
181 | '|',
182 | '"',
183 | '\\'
184 | )
185 | );
186 | }
187 |
188 | /**
189 | * Check the plumbing: Test the various ways to call `parseCsv()`.
190 | *
191 | * @test
192 | */
193 | public function testDifferentCsvCalls()
194 | {
195 | // NOTE(review): `$web` is created here but not used below; every case builds its own PHPScraper instance.
196 | $web = new \Spekulatius\PHPScraper\PHPScraper;
197 |
198 | // For reference we use a simple CSV string and the expected parsed data. This matches the hosted CSV.
199 | $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
200 | $csvData = [['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42]];
201 |
202 | // Case 1: Passing a CSV string in.
203 | $this->assertSame(
204 | // The reference data (expected value).
205 | $csvData,
206 |
207 | // Parse the $csvString directly.
208 | (new \Spekulatius\PHPScraper\PHPScraper)
209 | ->parseCsv($csvString)
210 | );
211 |
212 | // Case 2: `go` + `parseCsv()`
213 | $this->assertSame(
214 | // The reference data (expected value).
215 | $csvData,
216 |
217 | // Chained call using a CSV file as URL.
218 | (new \Spekulatius\PHPScraper\PHPScraper)
219 | ->go('https://test-pages.phpscraper.de/test.csv')
220 | ->parseCsv()
221 | );
222 |
223 | // Case 3: `parseCsv()` with absolute URL.
224 | $this->assertSame(
225 | // The reference data (expected value).
226 | $csvData,
227 |
228 | // Pass the absolute URL to `parseCsv()`
229 | (new \Spekulatius\PHPScraper\PHPScraper)
230 | ->parseCsv('https://test-pages.phpscraper.de/test.csv')
231 | );
232 |
233 | // Case 4: `go` + `parseCsv()` with relative URL.
234 | $this->assertSame(
235 | // The reference data (expected value).
236 | $csvData,
237 |
238 | // The 'go' sets the base URL for the following relative path.
239 | (new \Spekulatius\PHPScraper\PHPScraper)
240 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
241 | ->parseCsv('/test.csv')
242 | );
243 |
244 | // Case 5: `go` with base URL + `go` with relative URL + `parseCsv()`.
245 | // 5.1. Ensure the final URL is correct.
246 | $this->assertSame(
247 | 'https://test-pages.phpscraper.de/test.csv',
248 |
249 | // The first 'go' sets the base URL for the following `go` with relative URL.
250 | (new \Spekulatius\PHPScraper\PHPScraper)
251 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
252 | ->go('/test.csv')
253 | ->currentUrl()
254 | );
255 |
256 | // 5.2. Ensure the parsed CSV is correct.
257 | $this->assertSame(
258 | // The reference data (expected value).
259 | $csvData,
260 |
261 | // The first 'go' sets the base URL for the following `go` with relative URL.
262 | (new \Spekulatius\PHPScraper\PHPScraper)
263 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
264 | ->go('/test.csv')
265 | ->parseCsv()
266 | );
267 |
268 | // Case 6: With encoding params ('|' and '"'), no URL passed to `parseCsv()`.
269 | $this->assertSame(
270 | // The reference data (expected value).
271 | $csvData,
272 |
273 | // Two `go` calls navigate to the custom CSV; `parseCsv()` only receives the encoding params.
274 | (new \Spekulatius\PHPScraper\PHPScraper)
275 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
276 | ->go('/test-custom.csv')
277 | ->parseCsv(null, '|', '"')
278 | );
279 |
280 | // Case 7: With encoding params and (relative) URL
281 | $this->assertSame(
282 | // The reference data (expected value).
283 | $csvData,
284 |
285 | // The 'go' sets the base URL for the relative path passed to `parseCsv()`.
286 | (new \Spekulatius\PHPScraper\PHPScraper)
287 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
288 | ->parseCsv('/test-custom.csv', '|', '"')
289 | );
290 | }
291 |
292 | /**
293 | * Check the plumbing: Test the various ways to call `parseCsvWithHeader()`.
294 | *
295 | * @test
296 | */
297 | public function testDifferentCsvWithHeaderCalls()
298 | {
299 | // NOTE(review): `$web` is created here but not used below; every case builds its own PHPScraper instance.
300 | $web = new \Spekulatius\PHPScraper\PHPScraper;
301 |
302 | // For reference we use a simple CSV string and the expected parsed data. This matches the hosted CSV.
303 | $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
304 | $csvData = [
305 | ['date' => '1945-02-06', 'value' => 4.20],
306 | ['date' => '1952-03-11', 'value' => 42],
307 | ];
308 |
309 | // Case 1: Passing a CSV string in.
310 | $this->assertSame(
311 | // The reference data (expected value).
312 | $csvData,
313 |
314 | // Parse the $csvString directly.
315 | (new \Spekulatius\PHPScraper\PHPScraper)
316 | ->parseCsvWithHeader($csvString)
317 | );
318 |
319 | // Case 2a: `parseCsvWithHeader()` with absolute URL. NOTE(review): identical to Case 3 below.
320 | $this->assertSame(
321 | // The reference data (expected value).
322 | $csvData,
323 |
324 | // Pass the absolute URL directly, without a prior `go`.
325 | (new \Spekulatius\PHPScraper\PHPScraper)
326 | ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv')
327 | );
328 |
329 | // Case 2b: `go` + `parseCsvWithHeader()`
330 | $this->assertSame(
331 | // The reference data (expected value).
332 | $csvData,
333 |
334 | // Chained call using a CSV file as URL.
335 | (new \Spekulatius\PHPScraper\PHPScraper)
336 | ->go('https://test-pages.phpscraper.de/test.csv')
337 | ->parseCsvWithHeader()
338 | );
339 |
340 | // Case 3: `parseCsvWithHeader()` with absolute URL.
341 | $this->assertSame(
342 | // The reference data (expected value).
343 | $csvData,
344 |
345 | // Pass the absolute URL to `parseCsvWithHeader()`
346 | (new \Spekulatius\PHPScraper\PHPScraper)
347 | ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv')
348 | );
349 |
350 | // Case 4: `go` + `parseCsvWithHeader()` with relative URL.
351 | $this->assertSame(
352 | // The reference data (expected value).
353 | $csvData,
354 |
355 | // The 'go' sets the base URL for the following relative path.
356 | (new \Spekulatius\PHPScraper\PHPScraper)
357 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
358 | ->parseCsvWithHeader('/test.csv')
359 | );
360 |
361 | // Case 5: `go` with base URL + `go` with relative URL + `parseCsvWithHeader()`.
362 | // 5.1. Ensure the final URL is correct.
363 | $this->assertSame(
364 | 'https://test-pages.phpscraper.de/test.csv',
365 |
366 | // The first 'go' sets the base URL for the following `go` with relative URL.
367 | (new \Spekulatius\PHPScraper\PHPScraper)
368 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
369 | ->go('/test.csv')
370 | ->currentUrl()
371 | );
372 |
373 | // 5.2. Ensure the parsed CSV is correct.
374 | $this->assertSame(
375 | // The reference data (expected value).
376 | $csvData,
377 |
378 | // The first 'go' sets the base URL for the following `go` with relative URL.
379 | (new \Spekulatius\PHPScraper\PHPScraper)
380 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
381 | ->go('/test.csv')
382 | ->parseCsvWithHeader()
383 | );
384 |
385 | // Case 6: With encoding params ('|' and '"'), no URL passed to `parseCsvWithHeader()`.
386 | $this->assertSame(
387 | // The reference data (expected value).
388 | $csvData,
389 |
390 | // Two `go` calls navigate to the custom CSV; `parseCsvWithHeader()` only receives the encoding params.
391 | (new \Spekulatius\PHPScraper\PHPScraper)
392 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
393 | ->go('/test-custom.csv')
394 | ->parseCsvWithHeader(null, '|', '"')
395 | );
396 |
397 | // Case 7: With encoding params and (relative) URL
398 | $this->assertSame(
399 | // The reference data (expected value).
400 | $csvData,
401 |
402 | // The 'go' sets the base URL for the relative path passed to `parseCsvWithHeader()`.
403 | (new \Spekulatius\PHPScraper\PHPScraper)
404 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
405 | ->parseCsvWithHeader('/test-custom.csv', '|', '"')
406 | );
407 | }
408 | }
409 |
--------------------------------------------------------------------------------
/tests/ParserJsonTest.php:
--------------------------------------------------------------------------------
1 | parseJson();
19 | } catch (\Exception $e) {
20 | // Did we get the expected exception?
21 | $this->assertSame(
22 | 'You can not call parseJson() without parameter or initial navigation.',
23 | $e->getMessage()
24 | );
25 | }
26 | }
27 |
28 | /**
29 | * Test the various ways to call `parseJson()`.
30 | *
31 | * @test
32 | */
33 | public function testDifferentJsonCalls()
34 | {
35 | // This instance is only used to fetch the reference JSON below; each case creates its own instance.
36 | $web = new \Spekulatius\PHPScraper\PHPScraper;
37 |
38 | // Build the reference: fetch the hosted JSON file and decode it into an associative array.
39 | $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json');
40 | $jsonData = json_decode($jsonString, true);
41 |
42 | // Case 1: Passing a JSON string in.
43 | $this->assertSame(
44 | // The reference data (expected value).
45 | $jsonData,
46 |
47 | // Parse the $jsonString directly.
48 | (new \Spekulatius\PHPScraper\PHPScraper)
49 | ->parseJson($jsonString)
50 | );
51 |
52 | // Case 2: `go` + `parseJson()`
53 | $this->assertSame(
54 | // The reference data (expected value).
55 | $jsonData,
56 |
57 | // Chained call using a JSON file as URL.
58 | (new \Spekulatius\PHPScraper\PHPScraper)
59 | ->go('https://test-pages.phpscraper.de/index.json')
60 | ->parseJson()
61 | );
62 |
63 | // Case 3: `parseJson()` with absolute URL.
64 | $this->assertSame(
65 | // The reference data (expected value).
66 | $jsonData,
67 |
68 | // Pass the absolute URL to `parseJson()`
69 | (new \Spekulatius\PHPScraper\PHPScraper)
70 | ->parseJson('https://test-pages.phpscraper.de/index.json')
71 | );
72 |
73 | // Case 4: `go` + `parseJson()` with relative URL.
74 | $this->assertSame(
75 | // The reference data (expected value).
76 | $jsonData,
77 |
78 | // The 'go' sets the base URL for the following relative path.
79 | (new \Spekulatius\PHPScraper\PHPScraper)
80 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
81 | ->parseJson('/index.json')
82 | );
83 |
84 | // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`.
85 | // 5.1. Ensure the final URL is correct.
86 | $this->assertSame(
87 | 'https://test-pages.phpscraper.de/index.json',
88 |
89 | // The first 'go' sets the base URL for the following `go` with relative URL.
90 | (new \Spekulatius\PHPScraper\PHPScraper)
91 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
92 | ->go('/index.json')
93 | ->currentUrl()
94 | );
95 |
96 | // 5.2. Ensure the parsed JSON is correct.
97 | $this->assertSame(
98 | // The reference data (expected value).
99 | $jsonData,
100 |
101 | // The first 'go' sets the base URL for the following `go` with relative URL.
102 | (new \Spekulatius\PHPScraper\PHPScraper)
103 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
104 | ->go('/index.json')
105 | ->parseJson()
106 | );
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/tests/ParserXmlTest.php:
--------------------------------------------------------------------------------
1 | parseXml();
19 | } catch (\Exception $e) {
20 | // Did we get the expected exception?
21 | $this->assertSame(
22 | 'You can not call parseXml() without parameter or initial navigation.',
23 | $e->getMessage()
24 | );
25 | }
26 | }
27 |
28 | /**
29 | * @test
30 | */
31 | public function testDifferentXmlCalls()
32 | {
33 | // This instance is only used to download the sitemap that serves as reference XML; each case creates its own instance.
34 | $web = new \Spekulatius\PHPScraper\PHPScraper;
35 |
36 | // Build the reference: load the XML with SimpleXML and convert it to an array via a JSON round-trip.
37 | $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml');
38 | $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA);
39 | $xmlData = json_decode((string) json_encode($xml), true);
40 |
41 | // Case 1: Passing an XML string in.
42 | $this->assertSame(
43 | // The reference data (expected value).
44 | $xmlData,
45 |
46 | // Parse the XML string directly.
47 | (new \Spekulatius\PHPScraper\PHPScraper)
48 | ->parseXml($xmlString)
49 | );
50 |
51 | // Case 2: `go` + `parseXml()`
52 | $this->assertSame(
53 | // The reference data (expected value).
54 | $xmlData,
55 |
56 | // Chained call with XML as URL
57 | (new \Spekulatius\PHPScraper\PHPScraper)
58 | ->go('https://test-pages.phpscraper.de/sitemap.xml')
59 | ->parseXml()
60 | );
61 |
62 | // Case 3: `parseXml()` with absolute URL.
63 | $this->assertSame(
64 | // The reference data (expected value).
65 | $xmlData,
66 |
67 | // Pass the absolute URL to `parseXml()`
68 | (new \Spekulatius\PHPScraper\PHPScraper)
69 | ->parseXml('https://test-pages.phpscraper.de/sitemap.xml')
70 | );
71 |
72 | // Case 4: `go` + `parseXml()` with relative URL.
73 | $this->assertSame(
74 | // The reference data (expected value).
75 | $xmlData,
76 |
77 | // The 'go' sets the base URL for the following relative path.
78 | (new \Spekulatius\PHPScraper\PHPScraper)
79 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
80 | ->parseXml('/sitemap.xml')
81 | );
82 |
83 | // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`.
84 | // 5.1. Ensure the final URL is correct.
85 | $this->assertSame(
86 | 'https://test-pages.phpscraper.de/sitemap.xml',
87 |
88 | // The first 'go' sets the base URL for the following `go` with relative URL.
89 | (new \Spekulatius\PHPScraper\PHPScraper)
90 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
91 | ->go('/sitemap.xml')
92 | ->currentUrl()
93 | );
94 |
95 | // 5.2. Ensure the parsed XML is correct.
96 | $this->assertSame(
97 | // The reference data (expected value).
98 | $xmlData,
99 |
100 | // The first 'go' sets the base URL for the following `go` with relative URL.
101 | (new \Spekulatius\PHPScraper\PHPScraper)
102 | ->go('https://test-pages.phpscraper.de/meta/feeds.html')
103 | ->go('/sitemap.xml')
104 | ->parseXml()
105 | );
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/tests/RedirectTest.php:
--------------------------------------------------------------------------------
1 | go('https://test-pages.phpscraper.de');
16 |
17 | $this->assertNotSame(
18 | $web->currentUrl,
19 | 'https://test-pages.phpscraper.de/'
20 | );
21 | $this->assertSame(
22 | $web->currentUrl,
23 | 'https://phpscraper.de/'
24 | );
25 | }
26 |
27 | /**
28 | * @test
29 | */
30 | public function testDisabledRedirect()
31 | {
32 | $web = new \Spekulatius\PHPScraper\PHPScraper;
33 |
34 | $web->setConfig([
35 | 'follow_redirects' => false,
36 | 'follow_meta_refresh' => false,
37 | 'max_redirects' => -1,
38 | ]);
39 |
40 | // Navigate to the test page: This redirects to phpscraper.de
41 | $web->go('https://test-pages.phpscraper.de');
42 |
43 | $this->assertSame(
44 | 'https://test-pages.phpscraper.de',
45 | $web->currentUrl,
46 | );
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/tests/TitleTest.php:
--------------------------------------------------------------------------------
1 | go('https://test-pages.phpscraper.de/meta/missing.html');
16 |
17 | // Check the title as not given (null)
18 | $this->assertNull($web->title);
19 | }
20 |
21 | /**
22 | * @test
23 | */
24 | public function testWithHTMLEntity()
25 | {
26 | $web = new \Spekulatius\PHPScraper\PHPScraper;
27 |
28 | // Navigate to the test page.
29 | $web->go('https://test-pages.phpscraper.de/meta/html-entities.html');
30 |
31 | // Check the title
32 | $this->assertSame(
33 | 'Cat & Mouse',
34 | $web->title
35 | );
36 | }
37 |
38 | /**
39 | * @test
40 | */
41 | public function testLoremIpsum()
42 | {
43 | $web = new \Spekulatius\PHPScraper\PHPScraper;
44 |
45 | // Navigate to the test page.
46 | $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');
47 |
48 | // Check the title
49 | $this->assertSame(
50 | 'Lorem Ipsum',
51 | $web->title
52 | );
53 | }
54 |
55 | /**
56 | * @test
57 | */
58 | public function testGermanUmlaute()
59 | {
60 | $web = new \Spekulatius\PHPScraper\PHPScraper;
61 |
62 | // Navigate to the test page.
63 | $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');
64 |
65 | // Check the title
66 | $this->assertSame(
67 | 'A page with plenty of German umlaute everywhere (ä ü ö)',
68 | $web->title
69 | );
70 | }
71 |
72 | /**
73 | * @test
74 | */
75 | public function testChineseCharacters()
76 | {
77 | $web = new \Spekulatius\PHPScraper\PHPScraper;
78 |
79 | // Navigate to the test page.
80 | $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');
81 |
82 | // Check the title
83 | $this->assertSame(
84 | 'Page with Chinese Characters all over the place (加油)',
85 | $web->title
86 | );
87 | }
88 |
89 | /**
90 | * @test
91 | */
92 | public function testLongTitle()
93 | {
94 | $web = new \Spekulatius\PHPScraper\PHPScraper;
95 |
96 | // Navigate to the test page.
97 | $web->go('https://test-pages.phpscraper.de/title/long-title.html');
98 |
99 | // Check the title
100 | $this->assertSame(
101 | 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed mollis purus id ex consectetur facilisis. In gravida sodales nisl a consequat. Aenean ipsum sem, congue et rhoncus a, feugiat eget enim. Duis ut malesuada neque. Nam justo est, interdum eu massa in, volutpat vestibulum libero. Mauris a varius mauris, in vulputate ligula. Nulla rhoncus eget purus a sodales. Nulla facilisi. Proin purus purus, sodales non dolor in, lobortis elementum augue. Nulla sagittis, ex eu placerat varius, nulla mi rutrum odio, sit amet lacinia ipsum urna nec massa. Quisque posuere mauris id condimentum viverra.',
102 | $web->title
103 | );
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/tests/TwitterCardTest.php:
--------------------------------------------------------------------------------
1 | go('https://test-pages.phpscraper.de/meta/missing.html');
16 |
17 | // Empty array, because there aren't any twitter cards props set.
18 | $this->assertTrue(is_iterable($web->twitterCard));
19 | $this->assertTrue(empty($web->twitterCard));
20 | }
21 |
22 | /**
23 | * @test
24 | */
25 | public function testTwitterCard()
26 | {
27 | $web = new \Spekulatius\PHPScraper\PHPScraper;
28 |
29 | // Navigate to the test page.
30 | $web->go('https://test-pages.phpscraper.de/twittercard/example.html');
31 |
32 | // Check elements
33 | $this->assertSame('summary_large_image', $web->twitterCard['twitter:card']);
34 | $this->assertSame('Lorem Ipsum', $web->twitterCard['twitter:title']);
35 |
36 | // The whole set.
37 | $this->assertSame(
38 | [
39 | 'twitter:card' => 'summary_large_image',
40 | 'twitter:title' => 'Lorem Ipsum',
41 | 'twitter:description' => 'Lorem ipsum dolor etc.',
42 | 'twitter:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html',
43 | 'twitter:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
44 | ],
45 | $web->twitterCard
46 | );
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/tests/UrlTest.php:
--------------------------------------------------------------------------------
1 | assertNull($web->makeUrlAbsolute(null));
20 | }
21 |
22 | /**
23 | * @test
24 | */
25 | public function validateUriTest()
26 | {
27 | $web = new \Spekulatius\PHPScraper\PHPScraper;
28 |
29 | // We use any URL for this.
30 | $web->go('https://test-pages.phpscraper.de/content/lists.html');
31 |
32 | // Ensure the URL is set correctly.
33 | $this->assertSame(
34 | 'https://test-pages.phpscraper.de/content/lists.html',
35 | $web->currentUrl
36 | );
37 |
38 | // Ensure the host is parsed correctly.
39 | $this->assertSame(
40 | 'test-pages.phpscraper.de',
41 | $web->currentHost
42 | );
43 |
44 | // Ensure the host with protocol is parsed correctly.
45 | $this->assertSame(
46 | 'https://test-pages.phpscraper.de',
47 | $web->currentBaseHost
48 | );
49 | }
50 |
51 | /**
52 | * @test
53 | */
54 | public function testCurrentBaseHostWithBase()
55 | {
56 | $web = new \Spekulatius\PHPScraper\PHPScraper;
57 |
58 | // Navigate to the test page.
59 | // Contains:
60 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');
61 |
62 | // Check the base href being passed through the current base host.
63 | $this->assertSame(
64 | 'https://test-pages-with-base-href.phpscraper.de',
65 | $web->currentBaseHost
66 | );
67 | }
68 |
69 | /**
70 | * Basic processing of the URLs: paths are resolved against the host navigated to.
71 | *
72 | * @test
73 | */
74 | public function testMakeUrlAbsolute()
75 | {
76 | $web = new \Spekulatius\PHPScraper\PHPScraper;
77 |
78 | // Navigate to test page: This sets the base URL.
79 | $web->go('https://phpscraper.de');
80 |
81 | // Test variations of paths to be processed
82 | // With leading slash
83 | $this->assertSame(
84 | 'https://phpscraper.de/index.html',
85 | $web->makeUrlAbsolute('/index.html'),
86 | );
87 |
88 | // Without leading slash
89 | $this->assertSame(
90 | 'https://phpscraper.de/index.html',
91 | $web->makeUrlAbsolute('index.html'),
92 | );
93 |
94 | // Paths are considered.
95 | $this->assertSame(
96 | 'https://phpscraper.de/test/index.html',
97 | $web->makeUrlAbsolute('test/index.html'),
98 | );
99 |
100 | // Absolute URLs are untouched.
101 | $this->assertSame(
102 | 'https://example.com/index.html',
103 | $web->makeUrlAbsolute('https://example.com/index.html'),
104 | );
105 |
106 | // The protocol of an absolute URL is preserved (http stays http).
107 | $this->assertSame(
108 | 'http://example.com/index.html',
109 | $web->makeUrlAbsolute('http://example.com/index.html'),
110 | );
111 | }
112 |
113 | /**
114 | * Processing of the URLs when the page sets a base href: paths resolve against the base href host.
115 | *
116 | * @test
117 | */
118 | public function testMakeUrlAbsoluteConsiderBaseHref()
119 | {
120 | $web = new \Spekulatius\PHPScraper\PHPScraper;
121 |
122 | /**
123 | * Navigate to test page: This sets the base URL.
124 | *
125 | * It contains:
126 | *
127 | * ```html
128 | * (markup lost in this listing; presumably the page's base-href tag for `test-pages-with-base-href.phpscraper.de` - TODO confirm)
129 | * ```
130 | *
131 | * While it's located on `test-pages.phpscraper.de`.
132 | *
133 | * This page isn't actually used. It's purely to set the context.
134 | */
135 | $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');
136 |
137 | // Test variations of paths to be processed
138 | // With leading slash
139 | $this->assertSame(
140 | 'https://test-pages-with-base-href.phpscraper.de/index.html',
141 | $web->makeUrlAbsolute('/index.html'),
142 | );
143 |
144 | // Without leading slash
145 | $this->assertSame(
146 | 'https://test-pages-with-base-href.phpscraper.de/index.html',
147 | $web->makeUrlAbsolute('index.html'),
148 | );
149 |
150 | // Paths are considered.
151 | $this->assertSame(
152 | 'https://test-pages-with-base-href.phpscraper.de/test/index.html',
153 | $web->makeUrlAbsolute('test/index.html'),
154 | );
155 |
156 | // Absolute URLs are untouched.
157 | $this->assertSame(
158 | 'https://example.com/index.html',
159 | $web->makeUrlAbsolute('https://example.com/index.html'),
160 | );
161 |
162 | // The protocol of an absolute URL is preserved (http stays http).
163 | $this->assertSame(
164 | 'http://example.com/index.html',
165 | $web->makeUrlAbsolute('http://example.com/index.html'),
166 | );
167 | }
168 |
169 | /**
170 | * Test if passed in hosts are considered. It trumps any base-href and current url.
171 | *
172 | * @test
173 | */
174 | public function testMakeUrlAbsoluteWithBaseHost()
175 | {
176 | $web = new \Spekulatius\PHPScraper\PHPScraper;
177 |
178 | // Navigate to test page: This sets the base URL.
179 | $web->go('https://phpscraper.de');
180 |
181 | // Test variations of paths to be processed; the second argument is the host to resolve against.
182 | // With leading slash
183 | $this->assertSame(
184 | 'https://example.com/index.html',
185 | $web->makeUrlAbsolute('/index.html', 'https://example.com'),
186 | );
187 |
188 | // Without leading slash
189 | $this->assertSame(
190 | 'https://example.com/index.html',
191 | $web->makeUrlAbsolute('index.html', 'https://example.com'),
192 | );
193 |
194 | // Paths are considered.
195 | $this->assertSame(
196 | 'https://example.com/test/index.html',
197 | $web->makeUrlAbsolute('test/index.html', 'https://example.com'),
198 | );
199 |
200 | // Absolute URLs are untouched, even when a different base host is passed in.
201 | $this->assertSame(
202 | 'https://example.com/index.html',
203 | $web->makeUrlAbsolute('https://example.com/index.html', 'https://example-2.com/test/with/path'),
204 | );
205 |
206 | // The protocol of an absolute URL is preserved (http stays http).
207 | $this->assertSame(
208 | 'http://example.com/index.html',
209 | $web->makeUrlAbsolute('http://example.com/index.html', 'https://example-2.com/test/with/path'),
210 | );
211 | }
212 | }
213 |
--------------------------------------------------------------------------------