├── .gitignore ├── screenshot.png ├── src ├── __tests__ │ ├── __snapshots__ │ │ ├── bin.js.snap │ │ └── bin.js.md │ ├── bin.ts │ └── index.ts ├── types.graphql └── index.ts ├── examples ├── source.graphql └── hackernews.graphql ├── changelog.md ├── tsconfig.json ├── package.json ├── bin.js ├── readme.md ├── doc └── schema.md └── yarn.lock /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lachenmayer/graphql-scraper/HEAD/screenshot.png -------------------------------------------------------------------------------- /src/__tests__/__snapshots__/bin.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lachenmayer/graphql-scraper/HEAD/src/__tests__/__snapshots__/bin.js.snap -------------------------------------------------------------------------------- /examples/source.graphql: -------------------------------------------------------------------------------- 1 | { 2 | page(source: "
you can pass in HTML!
") { 3 | text(selector: "p") 4 | } 5 | } -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.2.1 4 | 5 | - `graphql` was incorrectly listed as a dev dependency. 6 | 7 | ## 1.2.0 8 | 9 | - Fixed CLI. 10 | 11 | ## 1.1.0 12 | 13 | - Added `childNodes` field 14 | - Added TypeScript declaration file 15 | 16 | ## 1.0.0 17 | 18 | Initial release -------------------------------------------------------------------------------- /examples/hackernews.graphql: -------------------------------------------------------------------------------- 1 | { 2 | page(url: "http://news.ycombinator.com") { 3 | items: queryAll(selector: "tr.athing") { 4 | rank: text(selector: "td span.rank") 5 | title: text(selector: "td.title a") 6 | sitebit: text(selector: "span.comhead a") 7 | url: attr(selector: "td.title a", name: "href") 8 | attrs: next { 9 | score: text(selector: "span.score") 10 | user: text(selector: "a:first-of-type") 11 | comments: text(selector: "a:nth-of-type(3)") 12 | } 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowSyntheticDefaultImports": true, 4 | "declaration": true, 5 | "module": "commonjs", 6 | "moduleResolution": "node", 7 | "noImplicitAny": true, 8 | "noUnusedLocals": true, 9 | "outDir": "./build", 10 | "preserveConstEnums": true, 11 | "removeComments": false, 12 | "sourceMap": true, 13 | "strictNullChecks": true, 14 | "suppressImplicitAnyIndexErrors": true, 15 | "target": "es2015" 16 | }, 17 | "include": ["src/**/*"], 18 | "exclude": ["node_modules"] 19 | } 20 | -------------------------------------------------------------------------------- /package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "graphql-scraper", 3 | "version": "1.2.1", 4 | "description": "Extract structured data from the web using GraphQL.", 5 | "author": "harry lachenmayerboom
bapboom
bapboom
bap| Field | 23 |Argument | 24 |Type | 25 |Description | 26 |
|---|---|---|---|
| page | 31 |Document | 32 |33 | | |
| url | 36 |String | 37 |38 | 39 | A URL to fetch the HTML source from. 40 | 41 | | 42 ||
| source | 45 |String | 46 |47 | 48 | A string containing HTML to be used as the source document. 49 | 50 | | 51 ||
| Field | 65 |Argument | 66 |Type | 67 |Description | 68 |
|---|---|---|---|
| content | 73 |String | 74 |75 | 76 | The HTML content of the subnodes 77 | 78 | | 79 ||
| selector | 82 |String | 83 |84 | 85 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 86 | 87 | | 88 ||
| html | 91 |String | 92 |93 | 94 | The HTML content of the selected DOM node 95 | 96 | | 97 ||
| selector | 100 |String | 101 |102 | 103 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 104 | 105 | | 106 ||
| text | 109 |String | 110 |111 | 112 | The text content of the selected DOM node 113 | 114 | | 115 ||
| selector | 118 |String | 119 |120 | 121 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 122 | 123 | | 124 ||
| tag | 127 |String | 128 |129 | 130 | The tag name of the selected DOM node 131 | 132 | | 133 ||
| selector | 136 |String | 137 |138 | 139 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 140 | 141 | | 142 ||
| attr | 145 |String | 146 |147 | 148 | An attribute of the selected node (e.g. `href`, `src`, etc.). 149 | 150 | | 151 ||
| selector | 154 |String | 155 |156 | 157 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 158 | 159 | | 160 ||
| name | 163 |String! | 164 |165 | 166 | The name of the attribute 167 | 168 | | 169 ||
| has | 172 |Boolean | 173 |174 | 175 | Returns true if an element with the given selector exists. 176 | 177 | | 178 ||
| selector | 181 |String | 182 |183 | 184 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 185 | 186 | | 187 ||
| query | 190 |Element | 191 |192 | 193 | Equivalent to [Element.querySelector](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelector). The selectors of any nested queries will be scoped to the resulting element. 194 | 195 | | 196 ||
| selector | 199 |String | 200 |201 | 202 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 203 | 204 | | 205 ||
| queryAll | 208 |[Element] | 209 |210 | 211 | Equivalent to [Element.querySelectorAll](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelectorAll). The selectors of any nested queries will be scoped to the resulting elements. 212 | 213 | | 214 ||
| selector | 217 |String | 218 |219 | 220 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 221 | 222 | | 223 ||
| children | 226 |[Element] | 227 |228 | 229 | An element's child elements. 230 | 231 | | 232 ||
| parent | 235 |Element | 236 |237 | 238 | An element's parent element. 239 | 240 | | 241 ||
| siblings | 244 |[Element] | 245 |246 | 247 | All elements which are at the same level in the tree as the current element, i.e. the children of the current element's parent. Includes the current element. 248 | 249 | | 250 ||
| next | 253 |Element | 254 |255 | 256 | The current element's next sibling. Includes text nodes. Equivalent to [Node.nextSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/nextSibling). 257 | 258 | | 259 ||
| nextAll | 262 |[Element] | 263 |264 | 265 | All of the current element's next siblings 266 | 267 | | 268 ||
| previous | 271 |Element | 272 |273 | 274 | The current element's previous sibling. Includes text nodes. Equivalent to [Node.previousSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/previousSibling). 275 | 276 | | 277 ||
| previousAll | 280 |[Element] | 281 |282 | 283 | All of the current element's previous siblings 284 | 285 | | 286 ||
| title | 289 |String | 290 |291 | 292 | The page title 293 | 294 | | 295 ||
| Field | 307 |Argument | 308 |Type | 309 |Description | 310 |
|---|---|---|---|
| content | 315 |String | 316 |317 | 318 | The HTML content of the subnodes 319 | 320 | | 321 ||
| selector | 324 |String | 325 |326 | 327 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 328 | 329 | | 330 ||
| html | 333 |String | 334 |335 | 336 | The HTML content of the selected DOM node 337 | 338 | | 339 ||
| selector | 342 |String | 343 |344 | 345 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 346 | 347 | | 348 ||
| text | 351 |String | 352 |353 | 354 | The text content of the selected DOM node 355 | 356 | | 357 ||
| selector | 360 |String | 361 |362 | 363 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 364 | 365 | | 366 ||
| tag | 369 |String | 370 |371 | 372 | The tag name of the selected DOM node 373 | 374 | | 375 ||
| selector | 378 |String | 379 |380 | 381 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 382 | 383 | | 384 ||
| attr | 387 |String | 388 |389 | 390 | An attribute of the selected node (e.g. `href`, `src`, etc.). 391 | 392 | | 393 ||
| selector | 396 |String | 397 |398 | 399 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 400 | 401 | | 402 ||
| name | 405 |String! | 406 |407 | 408 | The name of the attribute 409 | 410 | | 411 ||
| has | 414 |Boolean | 415 |416 | 417 | Returns true if an element with the given selector exists. 418 | 419 | | 420 ||
| selector | 423 |String | 424 |425 | 426 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 427 | 428 | | 429 ||
| query | 432 |Element | 433 |434 | 435 | Equivalent to [Element.querySelector](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelector). The selectors of any nested queries will be scoped to the resulting element. 436 | 437 | | 438 ||
| selector | 441 |String | 442 |443 | 444 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 445 | 446 | | 447 ||
| queryAll | 450 |[Element] | 451 |452 | 453 | Equivalent to [Element.querySelectorAll](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelectorAll). The selectors of any nested queries will be scoped to the resulting elements. 454 | 455 | | 456 ||
| selector | 459 |String | 460 |461 | 462 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 463 | 464 | | 465 ||
| children | 468 |[Element] | 469 |470 | 471 | An element's child elements. 472 | 473 | | 474 ||
| parent | 477 |Element | 478 |479 | 480 | An element's parent element. 481 | 482 | | 483 ||
| siblings | 486 |[Element] | 487 |488 | 489 | All elements which are at the same level in the tree as the current element, i.e. the children of the current element's parent. Includes the current element. 490 | 491 | | 492 ||
| next | 495 |Element | 496 |497 | 498 | The current element's next sibling. Includes text nodes. Equivalent to [Node.nextSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/nextSibling). 499 | 500 | | 501 ||
| nextAll | 504 |[Element] | 505 |506 | 507 | All of the current element's next siblings 508 | 509 | | 510 ||
| previous | 513 |Element | 514 |515 | 516 | The current element's previous sibling. Includes text nodes. Equivalent to [Node.previousSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/previousSibling). 517 | 518 | | 519 ||
| previousAll | 522 |[Element] | 523 |524 | 525 | All of the current element's previous siblings 526 | 527 | | 528 ||
| visit | 531 |Document | 532 |533 | 534 | If the element is a link, visit the page linked to in the href attribute. 535 | 536 | | 537 ||
| Field | 563 |Argument | 564 |Type | 565 |Description | 566 |
|---|---|---|---|
| content | 571 |String | 572 |573 | 574 | The HTML content of the subnodes 575 | 576 | | 577 ||
| selector | 580 |String | 581 |582 | 583 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 584 | 585 | | 586 ||
| html | 589 |String | 590 |591 | 592 | The HTML content of the selected DOM node 593 | 594 | | 595 ||
| selector | 598 |String | 599 |600 | 601 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 602 | 603 | | 604 ||
| text | 607 |String | 608 |609 | 610 | The text content of the selected DOM node 611 | 612 | | 613 ||
| selector | 616 |String | 617 |618 | 619 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 620 | 621 | | 622 ||
| tag | 625 |String | 626 |627 | 628 | The tag name of the selected DOM node 629 | 630 | | 631 ||
| selector | 634 |String | 635 |636 | 637 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 638 | 639 | | 640 ||
| attr | 643 |String | 644 |645 | 646 | An attribute of the selected node (e.g. `href`, `src`, etc.). 647 | 648 | | 649 ||
| selector | 652 |String | 653 |654 | 655 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 656 | 657 | | 658 ||
| name | 661 |String! | 662 |663 | 664 | The name of the attribute 665 | 666 | | 667 ||
| has | 670 |Boolean | 671 |672 | 673 | Returns true if an element with the given selector exists. 674 | 675 | | 676 ||
| selector | 679 |String | 680 |681 | 682 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 683 | 684 | | 685 ||
| query | 688 |Element | 689 |690 | 691 | Equivalent to [Element.querySelector](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelector). The selectors of any nested queries will be scoped to the resulting element. 692 | 693 | | 694 ||
| selector | 697 |String | 698 |699 | 700 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 701 | 702 | | 703 ||
| queryAll | 706 |[Element] | 707 |708 | 709 | Equivalent to [Element.querySelectorAll](https://developer.mozilla.org/en-US/docs/Web/API/Element/querySelectorAll). The selectors of any nested queries will be scoped to the resulting elements. 710 | 711 | | 712 ||
| selector | 715 |String | 716 |717 | 718 | A [CSS selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors). 719 | 720 | | 721 ||
| children | 724 |[Element] | 725 |726 | 727 | An element's child elements. 728 | 729 | | 730 ||
| parent | 733 |Element | 734 |735 | 736 | An element's parent element. 737 | 738 | | 739 ||
| siblings | 742 |[Element] | 743 |744 | 745 | All elements which are at the same level in the tree as the current element, i.e. the children of the current element's parent. Includes the current element. 746 | 747 | | 748 ||
| next | 751 |Element | 752 |753 | 754 | The current element's next sibling. Includes text nodes. Equivalent to [Node.nextSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/nextSibling). 755 | 756 | | 757 ||
| nextAll | 760 |[Element] | 761 |762 | 763 | All of the current element's next siblings 764 | 765 | | 766 ||
| previous | 769 |Element | 770 |771 | 772 | The current element's previous sibling. Includes text nodes. Equivalent to [Node.previousSibling](https://developer.mozilla.org/en-US/docs/Web/API/Node/previousSibling). 773 | 774 | | 775 ||
| previousAll | 778 |[Element] | 779 |780 | 781 | All of the current element's previous siblings 782 | 783 | | 784 ||