├── .editorconfig ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .npmrc ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── .nojekyll ├── assets │ ├── highlight.css │ ├── icons.css │ ├── icons.png │ ├── icons@2x.png │ ├── main.js │ ├── search.js │ ├── style.css │ ├── widgets.png │ └── widgets@2x.png ├── index.html ├── interfaces │ ├── CreateRunnerProps.html │ ├── GeosearchResult.html │ ├── RunProps.html │ ├── ScrapeProps.html │ ├── ScraperOptions.html │ └── ScraperProps.html └── modules.html ├── examples └── typescript │ ├── .gitignore │ ├── .prettierrc │ ├── README.md │ ├── package.json │ ├── src │ ├── amsterdam-coffeeshops │ │ └── scraper.ts │ ├── hacker-news │ │ └── scraper.ts │ ├── managed-jsdom │ │ └── scraper.ts │ └── myinstantaudios │ │ └── scraper.ts │ ├── tsconfig.json │ └── yarn.lock ├── package.json ├── scripts ├── codeImport.js └── generateReadme.js ├── src ├── http │ ├── fetchPage.ts │ └── geosearch.ts ├── index.test.ts ├── index.ts ├── scraper │ ├── __fixtures__ │ │ ├── testgeocache │ │ │ ├── 0039f8b7ef5f473fc3c80643193b61f5227303cec6a724a9802a259bb5aef020 │ │ │ ├── 03f6865c192bedf244aa6c5ea85a932f49e1cadd235af6b1c57413606888978f │ │ │ ├── 0812462f763a02c1355c811fa8e8c3484084d701532c5e66b1b6eb85e88ad1b5 │ │ │ ├── 0846132a14aa0ce8893952d4547bf4d5481aa1d71ed3b3b50b81efe833ab5d29 │ │ │ ├── 0865e061da0580ae96eb24647eebd4078d7359ed7480608b6a213749349aa14a │ │ │ ├── 086f7bfc27a9c76968cd048d92dedb33e06ded64e33f820091c51f2d8408874a │ │ │ ├── 0a6b263a3db63b4c2ff99431a9e1481bafb90cb21aad916e428c0012e815df5f │ │ │ ├── 0c544ac3d4edab094567372665274966288cd8cd288b47e52ea94614f9c52311 │ │ │ ├── 10b3d7e47d8ea217e8a0fbe8eea70d91908cd407c178fd9acac02fd5a2f9786e │ │ │ ├── 1495740b7f854575a94e51db8674fcc16c04d487ebc6b926b2f5db87840d6888 │ │ │ ├── 159c45a4e39526bd2f1ff5fdad16a68f6f736a637606cfac1ca381e2159e0434 │ │ │ ├── 16682a84168986c37c3616d2dd97d75bbd7e906a54bb27c8547655c8ee415c2e │ │ │ ├── 
18916a50c7bf475ffb0472fbfc93e77c5b8bb8d2487084631ee0402be724dc25 │ │ │ ├── 19233dc26e4a376cb27c1589699bbc020213389d94c3d798d531cb78c8c02ddf │ │ │ ├── 1df29b9f95d1aecafb8705a8f77c730ea61e6d3c5f43dc57341c0a2d8080c578 │ │ │ ├── 1f5330e42d87b16cf2bb2bbc4f1b36dcdf549bf862e925f329444f6ab11b9ad3 │ │ │ ├── 2168bbb398e97d02a86e146618af7abd08ef83c0370a19bfb363f167ed8f6f9b │ │ │ ├── 21b842ae27089920ead9694ce43de0bd3b3a32e8b115e73a47ab458a10cbc984 │ │ │ ├── 2590447e3b2e84d12febf10ad92e459ed3e7f642e64e89139a87a332b042216c │ │ │ ├── 276b07f17baf1ed948c646d915be4b24ed5958585ff641d837462eb04fa75dae │ │ │ ├── 29476a326b54102cdcf21808c6c293d998424eccbb189c2238e2f26d6bab0e6e │ │ │ ├── 2bf98eafbb327e0c92a670393799b5db96e2dacd59b044a28f68dc402bbfd0a9 │ │ │ ├── 2e13e4794d7dba7bb42642d7559ac132c875e0dce30192dd497c5eef56df036c │ │ │ ├── 2f10feb31b446cf536b4d21de782b40e0a71c20e8dce450567ef8928e4f0bdb7 │ │ │ ├── 2fc8ca6ef389738847f5f86c4e50a1179717ab97274582b72ce4445a1edb7aac │ │ │ ├── 3b7022aa71348539fa1cee9459a6ea58725c8ffbb4e5c15398e4768e7db261ca │ │ │ ├── 3ca975e1dd276b100f88537968077d6438b5659632a0c6d3ae5c5f83bd96e032 │ │ │ ├── 3cb066e01161d192b2a0465d1a2ab684af8d4f38972112ebccbf633e0350e484 │ │ │ ├── 3ce2a37f2f2528dd949553e85466a2728e22810f357843a5367058453d573aca │ │ │ ├── 3f0f5debfa379c049135b9acad0401a3b93afb475fec5a66ea1a9542afb9f839 │ │ │ ├── 43a1b6fd5cc0042a803465cff092510e7d1123174042dce331ca98b2956823c1 │ │ │ ├── 447112d290ca7c20d90de2da63021629c157086676f08d73be9e5ffde16edd5e │ │ │ ├── 44ac14bcab44f9d625144c88487af8a73bee2cc89d1c0d4a972b150a50b00930 │ │ │ ├── 47467a74141ec3dafc68ef5947ade64ca3442c5ed674648d69e29fa4d53086a9 │ │ │ ├── 47b020185d20b21b9b78655fa1485df8baf672bb748cbb5c7ff3472ba931ab13 │ │ │ ├── 49f30821ebf1efcb3e3fb9a0f44fb8a218cfad214ad50db76e61bfeefe57ec36 │ │ │ ├── 4baa8d744daa0f0e5b4096efd59f0284dd8a6c25019945cbaf1eba9a873369e6 │ │ │ ├── 4be34a1de09f84d141cda319bfba0ba35f0ba67e936b7635e0a8f03c6374da5c │ │ │ ├── 
4d562820676ed7aa893f42423230ae456a1a37912194741ea6c1c9d9e4123f3b │ │ │ ├── 4ee4f89da06c817cb80a6a037dd9138f7c18ac67199156d72f6e2d4f2c49e310 │ │ │ ├── 509877e1e1fd08ea12075ec2d733571e2f9d009c8b2a55fb9f926ae31704d589 │ │ │ ├── 52741613a7ee04eea5b8740920c426a1ffbf9c55ba27d97c35384424ed4e58a7 │ │ │ ├── 52b52c63469184633d1af1f56fd255b9683329264ed2ee11c52dad9a1a48ded3 │ │ │ ├── 539ea28205ec567c2e7f72b6423ccf23ba9f6c81f5971c4ade843776e3e84bdb │ │ │ ├── 53cd7d077a57c9b81622acbf09b628c88ae105eb911e93b70a2a0c66534374d6 │ │ │ ├── 581d10f0b8842522dd78714995682c4c4a3c79ac14c78bd273e407dcb2702f57 │ │ │ ├── 5830a524893ffb18a1049e68550bde23f84f8c42272a80c6b935c9542011a9ff │ │ │ ├── 5b77f6cf60383b79bad8206845892062812c3ca46941f055910b87a3fb3dd2d3 │ │ │ ├── 5d84e577ac3e47a12ac74f950e6920a961ef2213bbbe70defa58379ab3ef221e │ │ │ ├── 5ea874201efb2070f093965db13ac1cb0b40bbae58148957fbf393a957d625f0 │ │ │ ├── 615c3c5585c1d738abf73c0a5509237049cec5bdf585b8d1d1f3894bdb8bf384 │ │ │ ├── 61d8a19d6f9f13aede3bde2686ff536b7caeaaab4675dccc924ca08555c1395e │ │ │ ├── 6212bbf0aff267cee30a12c98deb373a9bbc01211a9e45872ea205ae75e3c988 │ │ │ ├── 6260ca5dcdcc7877fab7ace99fde01e6a7cf75a3b9fe9e6109dfa0e17042cd56 │ │ │ ├── 63e0b19c9f94c8666c8b8e47a662e80e27e3e91db0694bd906d8091c85b778eb │ │ │ ├── 64b05c14e6e5c699c3baa90289035105d716f8445acf67639bd372af94ca6962 │ │ │ ├── 66fe226ba471a9ed95d603e384251109834a45abc21f4b74d975fdfc7b1b7c16 │ │ │ ├── 6a1accdd3a86ba9f8ea9150e9fbe3202613b7c09d1a6449d4ef4825b7fcc700c │ │ │ ├── 6a2f3f92327a2d5e4855d522376a0471f477e64720c81e908af2496be887ba0a │ │ │ ├── 6a7e99f73e5a5a64997acb3f9b72172e12da2fc09fac20c6d19b670d4dfc2d99 │ │ │ ├── 6c08309e8e935fdf8246d2b5d2eabd04b6ec2bcdaa9ac35b2810825ab0b8fb65 │ │ │ ├── 6d37e5a901e039cdd6a3bb9d79467b2e3483c7f327629451588408dccd762370 │ │ │ ├── 6ee26c51cf92b520ed100633b505d2828700afc3f048de9e26c86ece9a6faa7b │ │ │ ├── 732f627132f66d39d986f23d48e0dbaf917f787aaf502805317fea080845fd53 │ │ │ ├── 
776ab3dbaa606a6f1617a5b69d06ff4fff9da4b141c52311ac1c5c54119e73cb │ │ │ ├── 795e9d05e106330894fa39a092fa123fdf43bdd5e538529795124b45bb228ba6 │ │ │ ├── 7a85e2edf0c1accc7c510d710f370eb81ef8d3099d5f90a95247e660953b5f71 │ │ │ ├── 7be172f36224113c5bc72f1f9536b693cf8b2cfc16ebe80ed63e3ee10f6154ff │ │ │ ├── 7c60636bc1026a3cd0f76321849377237492631439c7bb4715293a5186140223 │ │ │ ├── 7ef44a2aad2a5f3d1df9ddf4168fcd6e56301bb320b7a48a483d5368b1cf141a │ │ │ ├── 8040cede87d529be0b51b41cfcecd9bd2ad457e56bc37f51c5e85caaa9dc4dc9 │ │ │ ├── 80d586dca48e19177775d45973cd70bd82ef0d09d7ff22f2ef2c89357eb2c56b │ │ │ ├── 81a94845a1aa2fb212ce9b921d62bf6691cc6e02d18d48f243c1ac96f167fc8c │ │ │ ├── 82f1e31ad4ea57b73e33cacd668fafdba14649c492aec656bc239cdbdc5d9adb │ │ │ ├── 847f389e6ed45c4756f5fb08d2b89b951713873d3bab29e5602c087e90e44daa │ │ │ ├── 84d5858d72bd35ae8c8c39775a1871fc44a454900b64d5f6616ff443695cd006 │ │ │ ├── 886f6867352127d8d4f9d1a9232a3e5bf290a687ac7165ce2663ad438111b82b │ │ │ ├── 88a69063a977ea7ce6738ad7b5516fa605a7ef4a156bae2b1c49165bd06b10cc │ │ │ ├── 8e9bea954e6f2fd89b5397b1e90d9d1b1d5ad2bb8f336f42768a6036ddf05456 │ │ │ ├── 8ff420d6e8bb1b82440059d0d6258427d3b87717b0886328bfceb3fe6771fc48 │ │ │ ├── 921fd5cf3c36cb9c5f284badd6c2489d9ee09b5bd2ba4bcb6a6f94c9244e5fa3 │ │ │ ├── 96df13de11537bf367a7820bb4e722751c3af0c5e18945e46dd7eecdcc5ea0d0 │ │ │ ├── 9c6c0d631d904f4c0f959930ddaa6d807af22262e6a55ebe9a2cc5ee7f040640 │ │ │ ├── a40223ee69b3934cf80093c95899d50ede2addf45231ddf93148604f959029b3 │ │ │ ├── a57fddb893880b0fa5563d853073b3169d79f6777f11dcf8d1d0c82e93a7ea3e │ │ │ ├── a82b7aca1ac3b4708e9449cdfb837482eb493a3c94a175d1409e365a8ad097d8 │ │ │ ├── a99f86419c42445e61caee13a8f09414df741caa96d35d8054a699313cfe4bfa │ │ │ ├── aaa2a7e15e6c3c59a4209e4f5f9ebb6b7ce103411cb6bbd3f834bd835d805d71 │ │ │ ├── acf8fa2ef6767dc46746ac4c110ad753a2d2debbfb31860b64199e2de76b3353 │ │ │ ├── ad918b351a653069232d6e175ebc676ed1bb2e9df85a77d356bc20609f008e39 │ │ │ ├── 
ae4cc26f6fd0e3d4bca9c07a025f80404f99ba5fcbbeaf2df03cd60410895286 │ │ │ ├── afae2cb5f1dd188f6a11c4e3d22c4534e75ecfa0283a38eba2f59004e368c052 │ │ │ ├── b059a3db66fe137c46dc97bdce050f918bc37f928c84b25a57e7db71a96bea55 │ │ │ ├── b234601b6064291f6b38ba4cfa54bdf283706a18b352bd03c10a53246df8620f │ │ │ ├── b41c0a1e6a8e3d75b5e372ea64ff5bc6398e7333593b1783ee0ca17915ee7b2b │ │ │ ├── b473e52ed1608c361af24df03d99b8a7301a8b4f3de6fa26c0a12af26edb86b3 │ │ │ ├── b509fb74d9652dfd3ac88a193e15d9b3ce3787ac0aadaff3944d4fef55209ec5 │ │ │ ├── b9dbba925241dd16c32f89c45fe471171b4c68a83774e016a6ba522925376878 │ │ │ ├── bc526e33f9dc7bcde0ae23acbfb37b96daf1031613dbb644b3ba4e96fa8719b7 │ │ │ ├── bd07bfca38af1872baa418c3d14c03c44cd4b4c8efa8074b05dd1a963c2626f9 │ │ │ ├── be55b63a12c4b9157f7e2b4327b28832347dbc5896757ca0448d633ad006fb94 │ │ │ ├── bf978ffabf761580fb03df4b7a20b7158f2be8cedf4de380f20e4da67e94932e │ │ │ ├── bfee406d72533b4dd03f274c1f3096cba91846295f2d327cae5f13bf1c87fe3d │ │ │ ├── c23e066a9ce8b5046d5f5560c5ac44e55c153ff4f456e4c0e574d3d4d7836d12 │ │ │ ├── c32fcc15a3791eb9cbc79de83e60259e70226750a5a97f45a865a985c983ae3c │ │ │ ├── c3cd9d3e5a8bb951df50701a44ebc3d1abc0cbeeaa2a580eaaaf4c56c6f5ae4f │ │ │ ├── c5e0dd800d036c473a2ab80d849dd82849e2343df88d1287cfa2d651b1596f85 │ │ │ ├── c768c09aa5fbacfc136d04b74c779a5f659e77936f590d832adca32d0200cb49 │ │ │ ├── ca63cc90b22f649ffeb20c63489bec1f5d8f604a53ee38199f5e377c06db195c │ │ │ ├── ccf85a150785ed391d454d5b5440500b3edd1681cc5423a045847439a8ff76de │ │ │ ├── cdbbddd466905a01c2ecf0ddd635ca614d373e969aaf9d5e1011cdc9bc27d9be │ │ │ ├── d0122eab7c54f3994157eb0a1fc4ee7971b6591e964790e1ca5acb28aa9de817 │ │ │ ├── d3caf3820e69ac74dbec4de80472dd83bb11b2efd998e6b6783affd233d149d6 │ │ │ ├── d9dc442d9427d056c1d44ff2ca97e7677eeb551e039bce144cfb6cf0af9d02ad │ │ │ ├── dc9ef75a046a575921382818ba2c917c97a81f1475e18cbe4c16b8dbc8a7ed94 │ │ │ ├── dd13dddae0dfa2cde11eb0a9cb67b9164586bf630aafcb405d415b7a5a163fc9 │ │ │ ├── 
de29e424a2e0a0da1b6b7a4154723e7db4d55c486687d018158e9081430eefca │ │ │ ├── df4d08a52efcb2490d6510215be76551e359ff8fa4cc9e3388c685774f90a1cd │ │ │ ├── e1228869bcfe79268095e182010ca2da780bded8163c5106e7d06e22ef934de5 │ │ │ ├── e24e02f608ba963bcbfa11e0d450c78143a52c1eae53a6d33bf110f136cd35b6 │ │ │ ├── e253b14111fd8c2dd92c7d8b042a006d9587c025600ac3ef809b9458319e8752 │ │ │ ├── e659a0525d32ee2d8ebf6e43918ba1c6d91e0ce0aa71e1df444c00b3d67c04d9 │ │ │ ├── e6a0fcbf751aff8b2b7466ad861e4610f0a536542865185c94537c0f1bb991dc │ │ │ ├── e8d781d3658104ce9cf35ce4a65c22265af530eca2fa8eca63086d9582f17c13 │ │ │ ├── eae2617293c7b01db0c159cb34af9330e5be00ab6fcc41814f604aae9201547f │ │ │ ├── ecb15a2b447ab52a3e10899b3c8b99a419cafab02f16b9285fe4afe4a679faa3 │ │ │ ├── efc7ff96ccf15cd169249056c0095ab5dbf94a78c1bb3b674721e7c6cdfd4c64 │ │ │ ├── efec83556909f26863444c428582f21b8b6794d07bc21c486bdda5288aa68f9a │ │ │ ├── f33f84d975513b59cf3c05af6dbce3e31c99549288fd8aaad7cfe2a2821d70b5 │ │ │ ├── fa1b24d9aa017ffd09e2f01b104be89f6eff1a2ff28b6f8383fd2e56f337305f │ │ │ ├── fadead3afed615bb416c091586819d45ba96a94a429e44f6a21ecae61c43e9b4 │ │ │ ├── ff1386526e44cfb16ee23bcc03959b8f2f973606c884d4b4d2f7560c81412001 │ │ │ └── ffeccce77c62cfd793f1854b5b905117bd00a79de4fb642f0d6b31bf366ea1c7 │ │ └── testpagecache │ │ │ ├── 0f63a2a5a5620b745938e6a248b49704e580e3c8d9d24a4fb7ec68461f1edd41 │ │ │ ├── 14321f32b05bf98edaa2b9f47bfe4aebc6b52921347d7d83c7edd3ba4f4fc821 │ │ │ ├── 30b6358d12a63f41d2d04d004ac01b7333de240f5fea8503fe330cef68d9c7c8 │ │ │ ├── 325cd4c86652058d26486aa627d1499c9594490a67952dae2017db72265cf8f1 │ │ │ ├── 3f90c17709f75a617f83673f910813c02d682e9b594ff9c18628743dd2663e87 │ │ │ ├── 6b5262edf1b36610d24bd60be6f1ebc776d719d0b52b6ff8fbc479023aafe509 │ │ │ ├── 6dcff0086bb8146a78e7f5dc5217a68fe3dac920b983d7fde451a734a44e521e │ │ │ ├── 749fe6e229fbf8d75a2d429889278db46491017202630fe142686306a8a8501b │ │ │ ├── 7d0bb24b958190382b6e2796d19281475a96a3448df184adce8fdf5a580c16b8 │ │ │ ├── 
91f870faf917bacd98c1bd50ab034ed125205a2f0fbcd0576dde28e8f16be634 │ │ │ ├── 9bbbbcd44693e00563b14189f9afa7d6c527a06bbb38973d7e5fa1efe60cedd1 │ │ │ ├── aff3a0e0b65291ab5f57edbcdf830d30338155258a6fca7faff2d6734acf3be7 │ │ │ ├── b12ac077e94d073ed54cdf6b1eb4b099818b6ef2d4e839a54cf9581d563cdad0 │ │ │ ├── b579bf9ecf7bc42724872d2a88039a650b70c61d4e6f7df6a3fd1239933ac148 │ │ │ ├── cf9a7244cfae526284096b0e098d83b744bda835cfb5e4545fffa622e7883920 │ │ │ ├── d553b8972ba01ebb211eb08530f1527df82a6f75209ebe458c4b38221cff198c │ │ │ ├── f7e0f8e4b8d4cc820d0811f0019684856965403af0ae32284c9d0e6e19da3b60 │ │ │ └── fb1671a81c9b01b7cd998a90f4f75cd44ff6546f4a0e98bd60b9830fd6abc061 │ ├── __snapshots__ │ │ └── createScraper.test.ts.snap │ ├── createLogger.ts │ ├── createRunner.ts │ ├── createScraper.test.ts │ ├── createScraper.ts │ ├── scrape.test-d.ts │ └── scrape.ts ├── selectors │ └── createSelectorUtilities.ts └── utilities │ ├── createWindow.ts │ ├── flat.ts │ ├── hash.ts │ ├── mapNodeListToArray.ts │ └── supress.ts ├── template.md ├── tsconfig.json └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | charset = utf-8 7 | trim_trailing_whitespace = false 8 | insert_final_newline = false -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | jobs: 4 | build: 5 | name: Build, lint, test and release on Node 16 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Checkout repo 9 | uses: actions/checkout@v2 10 | 11 | - name: Use Node 16 12 | uses: actions/setup-node@v1 13 | with: 14 | node-version: 16 15 | 16 | - name: Install dependencies 17 | run: yarn 18 | 19 | - name: Build 20 | run: yarn build 21 | 22 | - name: Lint 23 | run: yarn lint 24 | 25 | - name: Test 26 | run: yarn test --ci --coverage 
--maxWorkers=2 27 | 28 | - name: Test Types 29 | run: yarn test:types 30 | 31 | - name: Coverage 32 | run: npx codecov -f coverage/*.json 33 | 34 | - name: Regenerate README.md 35 | run: yarn generate:readme 36 | 37 | - name: Regenerate Docs 38 | run: yarn docs 39 | 40 | - name: Pack Inspect 41 | run: yarn pack:inspect 42 | 43 | - name: Release 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 46 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 47 | GH_AUTHOR_NAME: github-actions 48 | GH_AUTHOR_EMAIL: actions@github.com 49 | run: yarn semantic-release 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | .DS_Store 3 | node_modules 4 | dist 5 | pagecache 6 | geocache 7 | coverage 8 | *.tgz 9 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | access=public 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [2.0.5](https://github.com/armand1m/papercut/compare/v2.0.4...v2.0.5) (2021-11-15) 2 | 3 | 4 | ### Bug Fixes 5 | 6 | * export utilities and add managed jsdom example ([132038b](https://github.com/armand1m/papercut/commit/132038bd46bf6386b168967925f0cadf8a906241)) 7 | 8 | ## [2.0.4](https://github.com/armand1m/papercut/compare/v2.0.3...v2.0.4) (2021-11-15) 9 | 10 | 11 | ### Bug Fixes 12 | 13 | * make jsdom and pino peer dependencies ([5aabad2](https://github.com/armand1m/papercut/commit/5aabad246c45127f9a3f5b23f18e1aa407410704)) 14 | 15 | ## [2.0.3](https://github.com/armand1m/papercut/compare/v2.0.2...v2.0.3) (2021-11-15) 16 | 17 | 18 | ### Bug Fixes 19 | 20 | * createWindow handler and support for promise result inference 
([34787fc](https://github.com/armand1m/papercut/commit/34787fc5f65fca3bfd5925f90b4221a06b57a42c)) 21 | * make type inference promise friendly ([dd04439](https://github.com/armand1m/papercut/commit/dd04439702feed9b3ea61b7b8d221947437ed775)) 22 | 23 | ## [2.0.2](https://github.com/armand1m/papercut/compare/v2.0.1...v2.0.2) (2021-11-14) 24 | 25 | 26 | ### Bug Fixes 27 | 28 | * add tests for pagination and increased coverage, fixes selector utilities ([#9](https://github.com/armand1m/papercut/issues/9)) ([eec651b](https://github.com/armand1m/papercut/commit/eec651bff2f018192d85030da86c017219ca85ab)) 29 | 30 | ## [2.0.1](https://github.com/armand1m/papercut/compare/v2.0.0...v2.0.1) (2021-11-14) 31 | 32 | 33 | ### Bug Fixes 34 | 35 | * improve logs and add tsd type tests ([#8](https://github.com/armand1m/papercut/issues/8)) ([7e64549](https://github.com/armand1m/papercut/commit/7e64549698ef07a5fd7e0c81723b12425107df47)) 36 | 37 | # [2.0.0](https://github.com/armand1m/papercut/compare/v1.0.4...v2.0.0) (2021-11-14) 38 | 39 | 40 | * Merge pull request #7 from armand1m/release/v2 ([798d6d8](https://github.com/armand1m/papercut/commit/798d6d82424c7f2f6ccd3eee5c34004b394a1042)), closes [#7](https://github.com/armand1m/papercut/issues/7) 41 | 42 | 43 | ### Bug Fixes 44 | 45 | * add missing fixtures ([6b5be8a](https://github.com/armand1m/papercut/commit/6b5be8ad78140ca1e8811b5a3fc95ac7a1686506)) 46 | * code import script ([a0e4887](https://github.com/armand1m/papercut/commit/a0e4887cdc7beaf02416b24d7d871c7c7ea2401d)) 47 | * node notifier vulnerabities ([dfab3f3](https://github.com/armand1m/papercut/commit/dfab3f34f9c127f97d9f6db874b43f4654fe5fb8)) 48 | * remove signale and hash geocache ([1ad4a30](https://github.com/armand1m/papercut/commit/1ad4a30bea2fa175e9b6d8063e05bd6e802ee017)) 49 | * test cache path and add hash key ([b7c9446](https://github.com/armand1m/papercut/commit/b7c9446aae1464983288ec5df9f29594ca4f4d59)) 50 | 51 | 52 | ### Features 53 | 54 | * papercut@v2.0.0 
([706cc5c](https://github.com/armand1m/papercut/commit/706cc5c212964e0606ebda66c1d1e2f595e46056)) 55 | * **v2:** new type-safe API introduced ([8276816](https://github.com/armand1m/papercut/commit/827681680736d83918566a9f5c4c7972781117ed)) 56 | 57 | 58 | ### BREAKING CHANGES 59 | 60 | * the entire API changed in favour of a function-based API 61 | instead of classes. 62 | * the entire API changed in favor of a function based API 63 | instead of classes. 64 | 65 | ## [1.0.4](https://github.com/armand1m/papercut/compare/v1.0.3...v1.0.4) (2021-08-29) 66 | 67 | 68 | ### Bug Fixes 69 | 70 | * selector fn types and add initial code for typescript example ([c0afc4b](https://github.com/armand1m/papercut/commit/c0afc4b56553e8a72abaaf4076f6d32ebadb76c8)) 71 | 72 | ## [1.0.3](https://github.com/armand1m/papercut/compare/v1.0.2...v1.0.3) (2021-07-06) 73 | 74 | 75 | ### Bug Fixes 76 | 77 | * github actions build pipeline ([f74a860](https://github.com/armand1m/papercut/commit/f74a86093d34d6837e0450752e39a6e47902eb48)) 78 | 79 | ## [1.0.2](https://github.com/armand1m/papercut/compare/v1.0.1...v1.0.2) (2021-07-06) 80 | 81 | 82 | ### Bug Fixes 83 | 84 | * add .npmrc with public access for scoped package ([72c207f](https://github.com/armand1m/papercut/commit/72c207f972e5bd3872a394f12283689e944588cd)) 85 | 86 | ## [1.0.1](https://github.com/armand1m/papercut/compare/v1.0.0...v1.0.1) (2021-07-06) 87 | 88 | 89 | ### Bug Fixes 90 | 91 | * update all references to use @armand1m/papercut ([536ac15](https://github.com/armand1m/papercut/commit/536ac15105d120fec083fb72c2fc4a99a7596893)) 92 | 93 | # 1.0.0 (2021-07-05) 94 | 95 | 96 | ### Bug Fixes 97 | 98 | * disable codecov temporarily ([354c9fd](https://github.com/armand1m/papercut/commit/354c9fdc843211aa1e9c6db399dd3d5e1e910404)) 99 | * introduce semantic-release and github actions ([f194fcd](https://github.com/armand1m/papercut/commit/f194fcd9259f68cc05c561418578291310a40eef)) 100 | * JSDOM memory leak and concurrency 
([352f650](https://github.com/armand1m/papercut/commit/352f650b1d78515f2437eb475d3cef4ae8ec9127)) 101 | * pass with no tests ([f4a0469](https://github.com/armand1m/papercut/commit/f4a04699680c9af2c560147154d99594f98760b2)) 102 | 103 | 104 | ### Features 105 | 106 | * initial commit ([d6d37d0](https://github.com/armand1m/papercut/commit/d6d37d039907d641c69d03bc76848ba6cd857039)) 107 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First of all, thanks for wanting to contribute to Papercut. The project is fairly small so there is no structure defined. Pull requests are very welcome, with or without the existence of an issue. Issues are very welcome as well, as long as they are descriptive enough. 4 | 5 | ## Getting code merged 6 | 7 | Please open a PR if you want to get some code in the main branch. It will be reviewed and merged eventually, in case we think it's ok. 8 | 9 | ## How do I make changes to this repository? 10 | 11 | Papercut being a library means that you need to follow certain conventions, mainly when writing commit messages. Please familiarize yourself with Conventional Commits (https://www.conventionalcommits.org/en/v1.0.0/). You'll never bump a version by hand when making changes to papercut, but `semantic-release` will during the CI pipeline based on the commit message, according to the Conventional Commits spec. 12 | 13 | ## Development environment 14 | 15 | You need node. How you make it available in your environment is up to you _(maybe you're into using docker containers for dev environments, for example)_. 
16 | 17 | ### Running Tests 18 | 19 | To run tests, run the following command: 20 | 21 | ```bash 22 | yarn test 23 | ``` 24 | 25 | ### Run Locally 26 | 27 | Clone the project 28 | 29 | ```bash 30 | git clone https://github.com/armand1m/papercut 31 | ``` 32 | 33 | Go to the project directory 34 | 35 | ```bash 36 | cd papercut 37 | ``` 38 | 39 | Install dependencies 40 | 41 | ```bash 42 | yarn 43 | ``` 44 | 45 | After that, feel free to update the `examples` folder to test your changes. 46 | 47 | Hopefully in the future, this will include a test runner as well to make the development experience a bit more reliable and enjoyable. 48 | 49 | ### Examples 50 | 51 | The projects in the examples folder are just node projects as well, with their own dependencies. They have `@armand1m/papercut` as a dependency, but linked to your local file system build. 52 | 53 | This means you _need_ to have a build of papercut locally. Just run `yarn build` in the root and you should be fine. 54 | 55 | Once that is done, you can proceed to actually running the example. 56 | 57 | The examples have their own `README.md` with instructions on how to run them. 58 | 59 | 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Armando Magalhães 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Papercut 2 | 3 | [![NPM](https://img.shields.io/npm/v/@armand1m/papercut.svg)](https://www.npmjs.com/package/@armand1m/papercut) 4 | [![codecov](https://codecov.io/gh/armand1m/papercut/branch/master/graph/badge.svg)](https://codecov.io/gh/armand1m/papercut) 5 | [![bundlephobia](https://badgen.net/bundlephobia/min/@armand1m/papercut)](https://bundlephobia.com/result?p=@armand1m/papercut) 6 | [![bundlephobia](https://badgen.net/bundlephobia/minzip/@armand1m/papercut)](https://bundlephobia.com/result?p=@armand1m/papercut) 7 | 8 | > Papercut is a scraping/crawling library for Node.js, written in Typescript. 9 | 10 | Papercut provides a small type-safe and tested foundation that makes it fairly easy to scrape webpages with confidence. 11 | 12 | ## Features 13 | 14 | ### Selectors API 15 | 16 | Inspired by GraphQL Resolvers, Papercut works similarly by allowing you to specify selectors for each scraper runner. 17 | The type definition for the scrape result array items is guaranteed to be compliant with the selectors given. 18 | 19 | ### JSDOM Integration 20 | 21 | Instead of relying on a headless browser engine, papercut relies on JSDOM to process client-side javascript code. This means that Papercut is also able to scrape Single Page Applications *(to a certain extent)*. 
22 | 23 | ### Concurrency controls 24 | 25 | Papercut makes use of Promise Pools to run pagination, node scraping and selector scraping. It comes with sane defaults for simple tasks, but configurable properties to make sure you have the flexibility to suit your needs. 26 | 27 | ### Pagination 28 | 29 | In most cases when web scraping, you're looking to scrape a feed. This feed can be quite long and you might have other challenges like pagination and a hard-to-predict total number of pages. 30 | 31 | Luckily, most of the time, there is some way to figure out the last page number in the UI. Papercut allows you to set a selector to find an element that contains the last page number and a callback for creating the url for each page number using the base url. 32 | 33 | As page urls are not always implemented in the same way, Papercut leaves it up to you to tell it how to build it. 34 | 35 | ### Page Caching 36 | 37 | As many websites introduce rate limits or blocks for scrapers, page caching is a useful feature for scraping. 38 | 39 | Once Papercut hits a page, it stores the payload locally in order to reuse it for subsequent executions. This reduces the need for network requests. 40 | 41 | **Note:** when scraping a large number of pages, be mindful about disk space. Papercut **does not** handle cache invalidation. 42 | 43 | ### Cached Geosearch 44 | 45 | Sometimes when scraping pages for a list of locations, you might want to convert those into latitude and longitude points. Papercut comes with a geosearch handler with caching that enables you to convert scraped addresses into lat/lng objects. 46 | 47 | To avoid overloading the services that papercut uses for that *(like Nominatim from OpenStreetMap)*, we cache the results to save on subsequent requests and add concurrency limits to comply with rate limits. 48 | 49 | ### Easy for simple tasks, flexible for difficult ones 50 | 51 | Papercut offers a nice selector foundation for the basic needs of a scraping tool. 
Text, attributes, url, image srcs, and many other handy selectors. 52 | 53 | When you find yourself in a situation where a simple selector wouldn't be enough, you'll still be able to access the element, the window, or even create a new window instance if needed. 54 | 55 | As tasks can grow in complexity, Papercut focuses on being a guardrail but not a gatekeeper. 56 | 57 | ## Usage/Examples 58 | 59 | You can find more examples in the `./examples` folder. 60 | 61 | ### Quick example 62 | 63 | Create an empty project with yarn: 64 | 65 | ```sh 66 | mkdir papercut-demo 67 | cd papercut-demo 68 | yarn init -y 69 | ``` 70 | 71 | Add papercut and the needed peer dependencies: 72 | 73 | ```sh 74 | yarn add @armand1m/papercut jsdom pino 75 | ``` 76 | 77 | #### Single page scraper 78 | 79 | For this example, we're going to scrape the first page of Hacker News. 80 | 81 | Set up a scraper instance and set the selectors using the utilities offered: 82 | 83 | ```ts file=./examples/typescript/src/hacker-news/scraper.ts 84 | import { createScraper } from '@armand1m/papercut'; 85 | 86 | const main = async () => { 87 | const scraper = createScraper({ 88 | name: `Hacker News`, 89 | options: { 90 | log: process.env.DEBUG === 'true', 91 | cache: true, 92 | } 93 | }); 94 | 95 | const results = await scraper.run({ 96 | strict: true, 97 | baseUrl: "https://news.ycombinator.com/", 98 | target: ".athing", 99 | selectors: { 100 | rank: (utils) => { 101 | const value = utils.text('.rank').replace(/^\D+/g, ''); 102 | return Number(value); 103 | }, 104 | name: ({ text }) => text('.titlelink'), 105 | url: ({ href }) => href('.titlelink'), 106 | score: ({ element }) => { 107 | return element.nextElementSibling?.querySelector('.score') 108 | ?.textContent; 109 | }, 110 | createdBy: ({ element }) => { 111 | return element.nextElementSibling?.querySelector('.hnuser') 112 | ?.textContent; 113 | }, 114 | createdAt: ({ element }) => { 115 | return element.nextElementSibling 116 | ?.querySelector('.age') 117 | 
?.getAttribute('title'); 118 | }, 119 | } 120 | }); 121 | 122 | console.log(JSON.stringify(results, null, 2)); 123 | }; 124 | 125 | main(); 126 | ``` 127 | 128 | Then run it using `node` or `ts-node`: 129 | 130 | ```sh 131 | npx ts-node ./single-page-scraper.ts 132 | ``` 133 | 134 | #### Paginated scraper 135 | 136 | For this example, because I live in Amsterdam, we're going to scrape the Amsterdam Coffeeshops website for all coffeeshops in Amsterdam. 137 | 138 | Set up a scraper instance and set the selectors using the utilities offered: 139 | 140 | ```ts file=./examples/typescript/src/amsterdam-coffeeshops/scraper.ts 141 | import { createScraper } from '@armand1m/papercut'; 142 | 143 | const createLabeledUrl = (label: string, url: string) => ({ label, url }); 144 | 145 | const main = async () => { 146 | const scraper = createScraper( 147 | { 148 | name: 'Amsterdam Coffeeshops', 149 | options: { 150 | cache: true, 151 | }, 152 | }, 153 | ); 154 | 155 | const results = await scraper.run({ 156 | strict: true, 157 | target: '.summary-box', 158 | baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops', 159 | pagination: { 160 | enabled: true, 161 | lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a', 162 | createPaginatedUrl: (baseUrl, pageNumber) => { 163 | return `${baseUrl}/p:${pageNumber}`; 164 | }, 165 | }, 166 | selectors: { 167 | name: ({ text }) => { 168 | return text('.media-body > h3 > a'); 169 | }, 170 | description: ({ text }) => { 171 | return text('.media-body > .summary-desc'); 172 | }, 173 | photo: ({ src }) => { 174 | return { url: src('.media-left > a > img') }; 175 | }, 176 | phone: ({ text }) => { 177 | return text('.media-right > .contact-info > mark > a'); 178 | }, 179 | address: ({ text }) => { 180 | const address = text('.media-body > address > p'); 181 | 182 | if (!address) { 183 | return undefined; 184 | } 185 | 186 | return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, ''); 187 | }, 188 | location: async 
(selectors, $this) => { 189 | const address = $this.address(selectors, $this); 190 | return selectors.geosearch(address); 191 | }, 192 | social: ({ href }) => { 193 | const websiteHref = href('.visit-website'); 194 | return websiteHref 195 | ? [createLabeledUrl('Official Website', websiteHref)] 196 | : []; 197 | }, 198 | menus: () => { 199 | /** TODO: scrape menus */ 200 | return []; 201 | }, 202 | badges: ({ all }) => { 203 | const { asArray: badges } = all('.media-left > div > div > img'); 204 | 205 | return badges 206 | .map((badge) => badge.getAttribute('title')) 207 | .filter((badge) => badge !== undefined); 208 | }, 209 | rating: ({ className }) => { 210 | const rateNumber = className( 211 | '.media-right > .summary-info > span > span' 212 | ); 213 | 214 | if (!rateNumber) { 215 | return 0; 216 | } 217 | 218 | return Number(rateNumber.replace('rate-', '')); 219 | }, 220 | } 221 | }); 222 | 223 | console.log(JSON.stringify(results, null, 2)); 224 | }; 225 | 226 | main(); 227 | ``` 228 | 229 | Then run it using `node` or `ts-node`: 230 | 231 | ```sh 232 | npx ts-node ./paginated-scraper.ts 233 | ``` 234 | 235 | #### Managed JSDOM 236 | 237 | In case you want to use your own JSDOM and Pino instance and tweak/configure as much as you prefer, you can use the `scrape` function instead. 238 | 239 | In the example below, we use the exposed `createWindow` and `fetchPage` utilities for convenience. You can use JSDOM constructor directly and any other strategy to fetch your page HTML as desired. 
240 | 241 | ```ts file=./examples/typescript/src/managed-jsdom/scraper.ts 242 | import pino from 'pino' 243 | import { scrape, fetchPage, createWindow } from '@armand1m/papercut'; 244 | 245 | const main = async () => { 246 | const logger = pino({ 247 | name: 'Hacker News', 248 | enabled: false 249 | }); 250 | 251 | const rawHTML = await fetchPage('https://news.ycombinator.com/') 252 | const window = createWindow(rawHTML); 253 | 254 | const results = await scrape({ 255 | strict: true, 256 | logger, 257 | document: window.document, 258 | target: ".athing", 259 | selectors: { 260 | rank: (utils) => { 261 | const value = utils.text('.rank').replace(/^\D+/g, ''); 262 | return Number(value); 263 | }, 264 | name: ({ text }) => text('.titlelink'), 265 | url: ({ href }) => href('.titlelink'), 266 | score: ({ element }) => { 267 | return element.nextElementSibling?.querySelector('.score') 268 | ?.textContent; 269 | }, 270 | createdBy: ({ element }) => { 271 | return element.nextElementSibling?.querySelector('.hnuser') 272 | ?.textContent; 273 | }, 274 | createdAt: ({ element }) => { 275 | return element.nextElementSibling 276 | ?.querySelector('.age') 277 | ?.getAttribute('title'); 278 | }, 279 | }, 280 | options: { 281 | log: false, 282 | cache: true, 283 | concurrency: { 284 | page: 2, 285 | node: 2, 286 | selector: 2 287 | } 288 | } 289 | }); 290 | 291 | window.close(); 292 | 293 | console.log(JSON.stringify(results, null, 2)); 294 | }; 295 | 296 | main(); 297 | ``` 298 | 299 | Then run it using `node` or `ts-node`: 300 | 301 | ```sh 302 | npx ts-node ./managed-jsdom.ts 303 | ``` 304 | 305 | ## API Reference 306 | 307 | [Click here to open the API reference.](https://armand1m.github.io/papercut) 308 | 309 | ## Environment Variables 310 | 311 | Papercut works well out of the box, but some environment variables are available for customizing behavior: 312 | 313 | `DEBUG=true`: enables debug level logs. 
314 | 315 | ## Roadmap 316 | 317 | * [x] Add unit tests 318 | * [x] Add documentation generation 319 | * [x] Create a gh-pages for the library 320 | * [x] Create more examples 321 | * [ ] Create a Medium article introducing the library 322 | 323 | ## Contributing 324 | 325 | Contributions are always welcome! 326 | 327 | See `CONTRIBUTING.md` for ways to get started. 328 | 329 | ## FAQ 330 | 331 | #### Why not use `puppeteer`, `selenium` or `webdriver`? 332 | 333 | JSDOM is lighter and easier than using a headless browser engine and *(I hope that it)* allows for enough scraping capabilities. Setup is minimal and it works out of the box with minimal overhead to users of this library. Please open an issue if you'd like to discuss this further; I could definitely be wrong. 334 | 335 | #### Why not use `cheerio`? 336 | 337 | I like the idea. I see papercut being flexible in the future to use different engines, so you'd be able to switch from JSDOM to cheerio, though I'm not sure I see much value in it. Please open an issue if you'd like to discuss a possible API implementation here. 338 | 339 | ## Contributors 340 | 341 | | Website | Name | 342 | | ---------------------- | --------------------- | 343 | | | **Armando Magalhaes** | 344 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | TypeDoc added this file to prevent GitHub Pages from using Jekyll. You can turn off this behavior by setting the `githubPages` option to false. 
-------------------------------------------------------------------------------- /docs/assets/highlight.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --light-hl-0: #000000; 3 | --dark-hl-0: #D4D4D4; 4 | --light-hl-1: #795E26; 5 | --dark-hl-1: #DCDCAA; 6 | --light-hl-2: #AF00DB; 7 | --dark-hl-2: #C586C0; 8 | --light-hl-3: #001080; 9 | --dark-hl-3: #9CDCFE; 10 | --light-hl-4: #A31515; 11 | --dark-hl-4: #CE9178; 12 | --light-hl-5: #0000FF; 13 | --dark-hl-5: #569CD6; 14 | --light-hl-6: #0070C1; 15 | --dark-hl-6: #4FC1FF; 16 | --light-hl-7: #811F3F; 17 | --dark-hl-7: #D16969; 18 | --light-hl-8: #EE0000; 19 | --dark-hl-8: #DCDCAA; 20 | --light-hl-9: #000000; 21 | --dark-hl-9: #D7BA7D; 22 | --light-hl-10: #267F99; 23 | --dark-hl-10: #4EC9B0; 24 | --light-hl-11: #098658; 25 | --dark-hl-11: #B5CEA8; 26 | --light-hl-12: #008000; 27 | --dark-hl-12: #6A9955; 28 | --light-code-background: #FFFFFF; 29 | --dark-code-background: #1E1E1E; 30 | } 31 | 32 | @media (prefers-color-scheme: light) { :root { 33 | --hl-0: var(--light-hl-0); 34 | --hl-1: var(--light-hl-1); 35 | --hl-2: var(--light-hl-2); 36 | --hl-3: var(--light-hl-3); 37 | --hl-4: var(--light-hl-4); 38 | --hl-5: var(--light-hl-5); 39 | --hl-6: var(--light-hl-6); 40 | --hl-7: var(--light-hl-7); 41 | --hl-8: var(--light-hl-8); 42 | --hl-9: var(--light-hl-9); 43 | --hl-10: var(--light-hl-10); 44 | --hl-11: var(--light-hl-11); 45 | --hl-12: var(--light-hl-12); 46 | --code-background: var(--light-code-background); 47 | } } 48 | 49 | @media (prefers-color-scheme: dark) { :root { 50 | --hl-0: var(--dark-hl-0); 51 | --hl-1: var(--dark-hl-1); 52 | --hl-2: var(--dark-hl-2); 53 | --hl-3: var(--dark-hl-3); 54 | --hl-4: var(--dark-hl-4); 55 | --hl-5: var(--dark-hl-5); 56 | --hl-6: var(--dark-hl-6); 57 | --hl-7: var(--dark-hl-7); 58 | --hl-8: var(--dark-hl-8); 59 | --hl-9: var(--dark-hl-9); 60 | --hl-10: var(--dark-hl-10); 61 | --hl-11: var(--dark-hl-11); 62 | --hl-12: 
var(--dark-hl-12); 63 | --code-background: var(--dark-code-background); 64 | } } 65 | 66 | body.light { 67 | --hl-0: var(--light-hl-0); 68 | --hl-1: var(--light-hl-1); 69 | --hl-2: var(--light-hl-2); 70 | --hl-3: var(--light-hl-3); 71 | --hl-4: var(--light-hl-4); 72 | --hl-5: var(--light-hl-5); 73 | --hl-6: var(--light-hl-6); 74 | --hl-7: var(--light-hl-7); 75 | --hl-8: var(--light-hl-8); 76 | --hl-9: var(--light-hl-9); 77 | --hl-10: var(--light-hl-10); 78 | --hl-11: var(--light-hl-11); 79 | --hl-12: var(--light-hl-12); 80 | --code-background: var(--light-code-background); 81 | } 82 | 83 | body.dark { 84 | --hl-0: var(--dark-hl-0); 85 | --hl-1: var(--dark-hl-1); 86 | --hl-2: var(--dark-hl-2); 87 | --hl-3: var(--dark-hl-3); 88 | --hl-4: var(--dark-hl-4); 89 | --hl-5: var(--dark-hl-5); 90 | --hl-6: var(--dark-hl-6); 91 | --hl-7: var(--dark-hl-7); 92 | --hl-8: var(--dark-hl-8); 93 | --hl-9: var(--dark-hl-9); 94 | --hl-10: var(--dark-hl-10); 95 | --hl-11: var(--dark-hl-11); 96 | --hl-12: var(--dark-hl-12); 97 | --code-background: var(--dark-code-background); 98 | } 99 | 100 | .hl-0 { color: var(--hl-0); } 101 | .hl-1 { color: var(--hl-1); } 102 | .hl-2 { color: var(--hl-2); } 103 | .hl-3 { color: var(--hl-3); } 104 | .hl-4 { color: var(--hl-4); } 105 | .hl-5 { color: var(--hl-5); } 106 | .hl-6 { color: var(--hl-6); } 107 | .hl-7 { color: var(--hl-7); } 108 | .hl-8 { color: var(--hl-8); } 109 | .hl-9 { color: var(--hl-9); } 110 | .hl-10 { color: var(--hl-10); } 111 | .hl-11 { color: var(--hl-11); } 112 | .hl-12 { color: var(--hl-12); } 113 | pre, code { background: var(--code-background); } 114 | -------------------------------------------------------------------------------- /docs/assets/icons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armand1m/papercut/8e3eca627c4468eef365c543ba67ac086c4d9be7/docs/assets/icons.png 
-------------------------------------------------------------------------------- /docs/assets/icons@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armand1m/papercut/8e3eca627c4468eef365c543ba67ac086c4d9be7/docs/assets/icons@2x.png -------------------------------------------------------------------------------- /docs/assets/search.js: -------------------------------------------------------------------------------- 1 | window.searchData = {"kinds":{"64":"Function","256":"Interface","1024":"Property","65536":"Type literal","4194304":"Type alias"},"rows":[{"id":0,"kind":64,"name":"createScraper","url":"modules.html#createScraper","classes":"tsd-kind-function"},{"id":1,"kind":4194304,"name":"Scraper","url":"modules.html#Scraper","classes":"tsd-kind-type-alias"},{"id":2,"kind":256,"name":"ScraperOptions","url":"interfaces/ScraperOptions.html","classes":"tsd-kind-interface"},{"id":3,"kind":1024,"name":"log","url":"interfaces/ScraperOptions.html#log","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":4,"kind":1024,"name":"cache","url":"interfaces/ScraperOptions.html#cache","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":5,"kind":1024,"name":"concurrency","url":"interfaces/ScraperOptions.html#concurrency","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":6,"kind":65536,"name":"__type","url":"interfaces/ScraperOptions.html#__type","classes":"tsd-kind-type-literal tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":7,"kind":1024,"name":"page","url":"interfaces/ScraperOptions.html#__type.page","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":8,"kind":1024,"name":"node","url":"interfaces/ScraperOptions.html#__type.node","classes":"tsd-kind-property 
tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":9,"kind":1024,"name":"selector","url":"interfaces/ScraperOptions.html#__type.selector","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":10,"kind":256,"name":"ScraperProps","url":"interfaces/ScraperProps.html","classes":"tsd-kind-interface"},{"id":11,"kind":1024,"name":"name","url":"interfaces/ScraperProps.html#name","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":12,"kind":1024,"name":"options","url":"interfaces/ScraperProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":13,"kind":64,"name":"createRunner","url":"modules.html#createRunner","classes":"tsd-kind-function"},{"id":14,"kind":4194304,"name":"SelectorMap","url":"modules.html#SelectorMap","classes":"tsd-kind-type-alias"},{"id":15,"kind":4194304,"name":"SelectorFunction","url":"modules.html#SelectorFunction","classes":"tsd-kind-type-alias"},{"id":16,"kind":65536,"name":"__type","url":"modules.html#SelectorFunction.__type","classes":"tsd-kind-type-literal tsd-parent-kind-type-alias","parent":"SelectorFunction"},{"id":17,"kind":256,"name":"CreateRunnerProps","url":"interfaces/CreateRunnerProps.html","classes":"tsd-kind-interface"},{"id":18,"kind":1024,"name":"logger","url":"interfaces/CreateRunnerProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":19,"kind":1024,"name":"options","url":"interfaces/CreateRunnerProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":20,"kind":256,"name":"RunProps","url":"interfaces/RunProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":21,"kind":1024,"name":"strict","url":"interfaces/RunProps.html#strict","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"RunProps"},{"id":22,"kind":1024,"name":"baseUrl","url":"interfaces/RunProps.html#baseUrl","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":23,"kind":1024,"name":"target","url":"interfaces/RunProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":24,"kind":1024,"name":"selectors","url":"interfaces/RunProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":25,"kind":1024,"name":"pagination","url":"interfaces/RunProps.html#pagination","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":26,"kind":64,"name":"scrape","url":"modules.html#scrape","classes":"tsd-kind-function tsd-has-type-parameter"},{"id":27,"kind":256,"name":"ScrapeProps","url":"interfaces/ScrapeProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":28,"kind":1024,"name":"strict","url":"interfaces/ScrapeProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":29,"kind":1024,"name":"target","url":"interfaces/ScrapeProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":30,"kind":1024,"name":"document","url":"interfaces/ScrapeProps.html#document","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":31,"kind":1024,"name":"selectors","url":"interfaces/ScrapeProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":32,"kind":1024,"name":"logger","url":"interfaces/ScrapeProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":33,"kind":1024,"name":"options","url":"interfaces/ScrapeProps.html#options","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":34,"kind":4194304,"name":"ScrapeResultType","url":"modules.html#ScrapeResultType","classes":"tsd-kind-type-alias tsd-has-type-parameter"},{"id":35,"kind":64,"name":"createSelectorUtilities","url":"modules.html#createSelectorUtilities","classes":"tsd-kind-function"},{"id":36,"kind":4194304,"name":"SelectorUtilities","url":"modules.html#SelectorUtilities","classes":"tsd-kind-type-alias"},{"id":37,"kind":64,"name":"geosearch","url":"modules.html#geosearch","classes":"tsd-kind-function"},{"id":38,"kind":256,"name":"GeosearchResult","url":"interfaces/GeosearchResult.html","classes":"tsd-kind-interface"},{"id":39,"kind":1024,"name":"latitude","url":"interfaces/GeosearchResult.html#latitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":40,"kind":1024,"name":"longitude","url":"interfaces/GeosearchResult.html#longitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":41,"kind":64,"name":"fetchPage","url":"modules.html#fetchPage","classes":"tsd-kind-function"},{"id":42,"kind":64,"name":"createWindow","url":"modules.html#createWindow","classes":"tsd-kind-function"}],"index":{"version":"2.3.9","fields":["name","parent"],"fieldVectors":[["name/0",[0,33.787]],["parent/0",[]],["name/1",[1,33.787]],["parent/1",[]],["name/2",[2,20.794]],["parent/2",[]],["name/3",[3,33.787]],["parent/3",[2,1.606]],["name/4",[4,33.787]],["parent/4",[2,1.606]],["name/5",[5,33.787]],["parent/5",[2,1.606]],["name/6",[6,28.679]],["parent/6",[2,1.606]],["name/7",[7,33.787]],["parent/7",[8,1.955]],["name/8",[9,33.787]],["parent/8",[8,1.955]],["name/9",[10,33.787]],["parent/9",[8,1.955]],["name/10",[11,25.314]],["parent/10",[]],["name/11",[12,33.787]],["parent/11",[11,1.955]],["name/12",[13,25.314]],["parent/12",[11,1.955]],["name/13",[14,33.787]],["parent/13",[]],["name/14",[15,33.787]],["parent/14",[]],["name/15",[16,28.679]],["parent/15",[]],["name/16",[6,28.679]]
,["parent/16",[16,2.215]],["name/17",[17,25.314]],["parent/17",[]],["name/18",[18,28.679]],["parent/18",[17,1.955]],["name/19",[13,25.314]],["parent/19",[17,1.955]],["name/20",[19,19.124]],["parent/20",[]],["name/21",[20,28.679]],["parent/21",[19,1.477]],["name/22",[21,33.787]],["parent/22",[19,1.477]],["name/23",[22,28.679]],["parent/23",[19,1.477]],["name/24",[23,28.679]],["parent/24",[19,1.477]],["name/25",[24,33.787]],["parent/25",[19,1.477]],["name/26",[25,33.787]],["parent/26",[]],["name/27",[26,17.693]],["parent/27",[]],["name/28",[20,28.679]],["parent/28",[26,1.367]],["name/29",[22,28.679]],["parent/29",[26,1.367]],["name/30",[27,33.787]],["parent/30",[26,1.367]],["name/31",[23,28.679]],["parent/31",[26,1.367]],["name/32",[18,28.679]],["parent/32",[26,1.367]],["name/33",[13,25.314]],["parent/33",[26,1.367]],["name/34",[28,33.787]],["parent/34",[]],["name/35",[29,33.787]],["parent/35",[]],["name/36",[30,33.787]],["parent/36",[]],["name/37",[31,33.787]],["parent/37",[]],["name/38",[32,25.314]],["parent/38",[]],["name/39",[33,33.787]],["parent/39",[32,1.955]],["name/40",[34,33.787]],["parent/40",[32,1.955]],["name/41",[35,33.787]],["parent/41",[]],["name/42",[36,33.787]],["parent/42",[]]],"invertedIndex":[["__type",{"_index":6,"name":{"6":{},"16":{}},"parent":{}}],["baseurl",{"_index":21,"name":{"22":{}},"parent":{}}],["cache",{"_index":4,"name":{"4":{}},"parent":{}}],["concurrency",{"_index":5,"name":{"5":{}},"parent":{}}],["createrunner",{"_index":14,"name":{"13":{}},"parent":{}}],["createrunnerprops",{"_index":17,"name":{"17":{}},"parent":{"18":{},"19":{}}}],["createscraper",{"_index":0,"name":{"0":{}},"parent":{}}],["createselectorutilities",{"_index":29,"name":{"35":{}},"parent":{}}],["createwindow",{"_index":36,"name":{"42":{}},"parent":{}}],["document",{"_index":27,"name":{"30":{}},"parent":{}}],["fetchpage",{"_index":35,"name":{"41":{}},"parent":{}}],["geosearch",{"_index":31,"name":{"37":{}},"parent":{}}],["geosearchresult",{"_index":32,"name":{"38":{}
},"parent":{"39":{},"40":{}}}],["latitude",{"_index":33,"name":{"39":{}},"parent":{}}],["log",{"_index":3,"name":{"3":{}},"parent":{}}],["logger",{"_index":18,"name":{"18":{},"32":{}},"parent":{}}],["longitude",{"_index":34,"name":{"40":{}},"parent":{}}],["name",{"_index":12,"name":{"11":{}},"parent":{}}],["node",{"_index":9,"name":{"8":{}},"parent":{}}],["options",{"_index":13,"name":{"12":{},"19":{},"33":{}},"parent":{}}],["page",{"_index":7,"name":{"7":{}},"parent":{}}],["pagination",{"_index":24,"name":{"25":{}},"parent":{}}],["runprops",{"_index":19,"name":{"20":{}},"parent":{"21":{},"22":{},"23":{},"24":{},"25":{}}}],["scrape",{"_index":25,"name":{"26":{}},"parent":{}}],["scrapeprops",{"_index":26,"name":{"27":{}},"parent":{"28":{},"29":{},"30":{},"31":{},"32":{},"33":{}}}],["scraper",{"_index":1,"name":{"1":{}},"parent":{}}],["scraperesulttype",{"_index":28,"name":{"34":{}},"parent":{}}],["scraperoptions",{"_index":2,"name":{"2":{}},"parent":{"3":{},"4":{},"5":{},"6":{}}}],["scraperoptions.__type",{"_index":8,"name":{},"parent":{"7":{},"8":{},"9":{}}}],["scraperprops",{"_index":11,"name":{"10":{}},"parent":{"11":{},"12":{}}}],["selector",{"_index":10,"name":{"9":{}},"parent":{}}],["selectorfunction",{"_index":16,"name":{"15":{}},"parent":{"16":{}}}],["selectormap",{"_index":15,"name":{"14":{}},"parent":{}}],["selectors",{"_index":23,"name":{"24":{},"31":{}},"parent":{}}],["selectorutilities",{"_index":30,"name":{"36":{}},"parent":{}}],["strict",{"_index":20,"name":{"21":{},"28":{}},"parent":{}}],["target",{"_index":22,"name":{"23":{},"29":{}},"parent":{}}]],"pipeline":[]}} -------------------------------------------------------------------------------- /docs/assets/widgets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armand1m/papercut/8e3eca627c4468eef365c543ba67ac086c4d9be7/docs/assets/widgets.png -------------------------------------------------------------------------------- 
/docs/assets/widgets@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armand1m/papercut/8e3eca627c4468eef365c543ba67ac086c4d9be7/docs/assets/widgets@2x.png -------------------------------------------------------------------------------- /docs/interfaces/CreateRunnerProps.html: -------------------------------------------------------------------------------- 1 | CreateRunnerProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface CreateRunnerProps

Hierarchy

  • CreateRunnerProps

Index

Properties

Properties

logger

logger: Logger
2 |

A pino.Logger instance.

3 |

options

4 |

The scraper options. 5 | Use this to tweak log, cache and concurrency settings.

6 |

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/GeosearchResult.html: -------------------------------------------------------------------------------- 1 | GeosearchResult | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface GeosearchResult

Hierarchy

  • GeosearchResult

Index

Properties

latitude

latitude: number

longitude

longitude: number

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/RunProps.html: -------------------------------------------------------------------------------- 1 | RunProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface RunProps<T, B>

Type parameters

Hierarchy

  • RunProps

Index

Properties

baseUrl

baseUrl: string
2 |

The base URL to start scraping from.

3 |

This page will be fetched, parsed and mounted in a virtual JSDOM instance.

4 |

Optional pagination

pagination?: PaginationOptions
5 |

Optional pagination feature.

6 |

If enabled and configured, this will make papercut 7 | fetch, parse, mount and scrape multiple pages based 8 | on a URL creation pattern.

9 |

As long as you have a way to fetch the last page number 10 | from the page you're scraping, and use it as a query param 11 | in the page url, you should be fine.

12 |

selectors

selectors: T
13 |

The selectors to be used during the scraping process.

14 |

The result object will match the schema of the selectors.

15 |

strict

strict: B
16 |

If enabled, this will make Papercut scrape the page in strict mode. 17 | This means that in case a selector function fails, the entire scraping will 18 | be halted with an error.

19 |

When enabled, the result types will not expect undefined values.

20 |

target

target: string
21 |

The DOM selector for the target nodes to be scraped.

22 |

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/ScrapeProps.html: -------------------------------------------------------------------------------- 1 | ScrapeProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScrapeProps<T, B>

Type parameters

Hierarchy

  • ScrapeProps

Index

Properties

document

document: Document

logger

logger: Logger

options

selectors

selectors: T

strict

strict: B

target

target: string

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/ScraperOptions.html: -------------------------------------------------------------------------------- 1 | ScraperOptions | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperOptions

Hierarchy

  • ScraperOptions

Index

Properties

cache

cache: boolean
2 |

Enables HTML payload caching on the disk. 3 | Keep in mind that papercut will not clear the cache for you. 4 | When enabling this, it's your responsibility to deal with cache invalidation.

5 |
default

false

6 |

concurrency

concurrency: { node: number; page: number; selector: number }
7 |

Concurrency settings.

8 |

Type declaration

  • node: number
    9 |

    Amount of concurrent promises for node scraping.

    10 |
    default

    2

    11 |
  • page: number
    12 |

    Amount of concurrent promises for page scraping.

    13 |
    default

    2

    14 |
  • selector: number
    15 |

    Amount of concurrent promises for selector scraping.

    16 |
    default

    2

    17 |

log

log: boolean
18 |

Enables writing pino logs to stdout.

19 |
default

process.env.DEBUG === "true"

20 |

Generated using TypeDoc

-------------------------------------------------------------------------------- /docs/interfaces/ScraperProps.html: -------------------------------------------------------------------------------- 1 | ScraperProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperProps

Hierarchy

  • ScraperProps

Index

Properties

Properties

name

name: string
2 |

The scraper name. 3 | This will be used only for logging purposes.

4 |

Optional options

options?: Partial<ScraperOptions>
5 |

The scraper options. 6 | Use this to tweak log, cache and concurrency settings.

7 |

Generated using TypeDoc

-------------------------------------------------------------------------------- /examples/typescript/.gitignore: -------------------------------------------------------------------------------- 1 | pagecache 2 | geocache 3 | node_modules 4 | -------------------------------------------------------------------------------- /examples/typescript/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "jsxBracketSameLine": true, 3 | "printWidth": 70, 4 | "singleQuote": true, 5 | "trailingComma": "es5" 6 | } 7 | -------------------------------------------------------------------------------- /examples/typescript/README.md: -------------------------------------------------------------------------------- 1 | # typescript-papercut-examples 2 | 3 | This is a Node v16 project demonstrating how to use papercut with typescript. 4 | 5 | # Scrapers available: 6 | 7 | - Hacker News: Scrapes the first page of Hacker News. 8 | 9 | # Running a scraper 10 | 11 | In this case, make sure you prepared `papercut` in the root directory and built it. 12 | 13 | Once done, run `yarn` in this folder. 
14 | 15 | After that, you should be able to run any scraper: 16 | 17 | ```sh 18 | yarn run-scraper ./src/hacker-news/scraper.ts 19 | ``` 20 | 21 | In case you want to write the output to a file, run it with the `--silent` flag to silence yarn noise: 22 | 23 | ```sh 24 | yarn --silent run-scraper ./src/hacker-news/scraper.ts 25 | ``` 26 | 27 | To see debug information, run the scraper with the env var `DEBUG=true`: 28 | 29 | ```sh 30 | DEBUG=true yarn --silent run-scraper ./src/hacker-news/scraper.ts 31 | ``` 32 | -------------------------------------------------------------------------------- /examples/typescript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "papercut-typescript-example", 3 | "version": "0.0.0", 4 | "description": "Example on how to use papercut with typescript", 5 | "author": "Armando Magalhães", 6 | "license": "MIT", 7 | "devDependencies": { 8 | "@types/node": "^16.7.5", 9 | "nodemon": "^2.0.12", 10 | "prettier": "^2.3.2", 11 | "ts-node": "^10.2.1", 12 | "typescript": "^4.4.4" 13 | }, 14 | "dependencies": { 15 | "@armand1m/papercut": "file:../..", 16 | "jsdom": "^18.1.0", 17 | "pino": "^7.2.0", 18 | "tslib": "^2.3.1" 19 | }, 20 | "scripts": { 21 | "build": "tsc", 22 | "run-scraper": "ts-node", 23 | "lint": "prettier --check './src/**/*.{tsx,ts}'", 24 | "lint:fix": "prettier --write './src/**/*.{tsx,ts}'" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/typescript/src/amsterdam-coffeeshops/scraper.ts: -------------------------------------------------------------------------------- 1 | import { createScraper } from '@armand1m/papercut'; 2 | 3 | const createLabeledUrl = (label: string, url: string) => ({ label, url }); 4 | 5 | const main = async () => { 6 | const scraper = createScraper( 7 | { 8 | name: 'Amsterdam Coffeeshops', 9 | options: { 10 | cache: true, 11 | }, 12 | }, 13 | ); 14 | 15 | const results = await 
scraper.run({ 16 | strict: true, 17 | target: '.summary-box', 18 | baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops', 19 | pagination: { 20 | enabled: true, 21 | lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a', 22 | createPaginatedUrl: (baseUrl, pageNumber) => { 23 | return `${baseUrl}/p:${pageNumber}`; 24 | }, 25 | }, 26 | selectors: { 27 | name: ({ text }) => { 28 | return text('.media-body > h3 > a'); 29 | }, 30 | description: ({ text }) => { 31 | return text('.media-body > .summary-desc'); 32 | }, 33 | photo: ({ src }) => { 34 | return { url: src('.media-left > a > img') }; 35 | }, 36 | phone: ({ text }) => { 37 | return text('.media-right > .contact-info > mark > a'); 38 | }, 39 | address: ({ text }) => { 40 | const address = text('.media-body > address > p'); 41 | 42 | if (!address) { 43 | return undefined; 44 | } 45 | 46 | return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, ''); 47 | }, 48 | location: async (selectors, $this) => { 49 | const address = $this.address(selectors, $this); 50 | return selectors.geosearch(address); 51 | }, 52 | social: ({ href }) => { 53 | const websiteHref = href('.visit-website'); 54 | return websiteHref 55 | ? 
[createLabeledUrl('Official Website', websiteHref)] 56 | : []; 57 | }, 58 | menus: () => { 59 | /** TODO: scrape menus */ 60 | return []; 61 | }, 62 | badges: ({ all }) => { 63 | const { asArray: badges } = all('.media-left > div > div > img'); 64 | 65 | return badges 66 | .map((badge) => badge.getAttribute('title')) 67 | .filter((badge) => badge !== undefined); 68 | }, 69 | rating: ({ className }) => { 70 | const rateNumber = className( 71 | '.media-right > .summary-info > span > span' 72 | ); 73 | 74 | if (!rateNumber) { 75 | return 0; 76 | } 77 | 78 | return Number(rateNumber.replace('rate-', '')); 79 | }, 80 | } 81 | }); 82 | 83 | console.log(JSON.stringify(results, null, 2)); 84 | }; 85 | 86 | main(); 87 | -------------------------------------------------------------------------------- /examples/typescript/src/hacker-news/scraper.ts: -------------------------------------------------------------------------------- 1 | import { createScraper } from '@armand1m/papercut'; 2 | 3 | const main = async () => { 4 | const scraper = createScraper({ 5 | name: `Hacker News`, 6 | options: { 7 | log: process.env.DEBUG === 'true', 8 | cache: true, 9 | } 10 | }); 11 | 12 | const results = await scraper.run({ 13 | strict: true, 14 | baseUrl: "https://news.ycombinator.com/", 15 | target: ".athing", 16 | selectors: { 17 | rank: (utils) => { 18 | const value = utils.text('.rank').replace(/^\D+/g, ''); 19 | return Number(value); 20 | }, 21 | name: ({ text }) => text('.titlelink'), 22 | url: ({ href }) => href('.titlelink'), 23 | score: ({ element }) => { 24 | return element.nextElementSibling?.querySelector('.score') 25 | ?.textContent; 26 | }, 27 | createdBy: ({ element }) => { 28 | return element.nextElementSibling?.querySelector('.hnuser') 29 | ?.textContent; 30 | }, 31 | createdAt: ({ element }) => { 32 | return element.nextElementSibling 33 | ?.querySelector('.age') 34 | ?.getAttribute('title'); 35 | }, 36 | } 37 | }); 38 | 39 | console.log(JSON.stringify(results, null, 2)); 
40 | }; 41 | 42 | main(); 43 | -------------------------------------------------------------------------------- /examples/typescript/src/managed-jsdom/scraper.ts: -------------------------------------------------------------------------------- 1 | import pino from 'pino' 2 | import { scrape, fetchPage, createWindow } from '@armand1m/papercut'; 3 | 4 | const main = async () => { 5 | const logger = pino({ 6 | name: 'Hacker News', 7 | enabled: false 8 | }); 9 | 10 | const rawHTML = await fetchPage('https://news.ycombinator.com/') 11 | const window = createWindow(rawHTML); 12 | 13 | const results = await scrape({ 14 | strict: true, 15 | logger, 16 | document: window.document, 17 | target: ".athing", 18 | selectors: { 19 | rank: (utils) => { 20 | const value = utils.text('.rank').replace(/^\D+/g, ''); 21 | return Number(value); 22 | }, 23 | name: ({ text }) => text('.titlelink'), 24 | url: ({ href }) => href('.titlelink'), 25 | score: ({ element }) => { 26 | return element.nextElementSibling?.querySelector('.score') 27 | ?.textContent; 28 | }, 29 | createdBy: ({ element }) => { 30 | return element.nextElementSibling?.querySelector('.hnuser') 31 | ?.textContent; 32 | }, 33 | createdAt: ({ element }) => { 34 | return element.nextElementSibling 35 | ?.querySelector('.age') 36 | ?.getAttribute('title'); 37 | }, 38 | }, 39 | options: { 40 | log: false, 41 | cache: true, 42 | concurrency: { 43 | page: 2, 44 | node: 2, 45 | selector: 2 46 | } 47 | } 48 | }); 49 | 50 | window.close(); 51 | 52 | console.log(JSON.stringify(results, null, 2)); 53 | }; 54 | 55 | main(); 56 | -------------------------------------------------------------------------------- /examples/typescript/src/myinstantaudios/scraper.ts: -------------------------------------------------------------------------------- 1 | import { createScraper } from '@armand1m/papercut'; 2 | 3 | const baseUrl = 'https://www.myinstants.com'; 4 | 5 | const main = async () => { 6 | const scraper = createScraper( 7 | { 8 | name: 'My 
Instants', 9 | options: { 10 | cache: true, 11 | } 12 | }, 13 | ); 14 | 15 | const results = await scraper.run({ 16 | strict: false, 17 | baseUrl, 18 | target: '.instant', 19 | pagination: { 20 | enabled: true, 21 | lastPageNumberSelector: '#results-pagination > li:nth-child(7)', 22 | createPaginatedUrl: (baseUrl, pageNumber) => { 23 | return `${baseUrl}/index/us/?page=${pageNumber}`; 24 | } 25 | }, 26 | selectors: { 27 | instantName: ({ text }) => text('.instant-link'), 28 | instantPageUrl: ({ href }) => 29 | `${baseUrl}${href('.instant-link')}`, 30 | instantSoundUrl: async (selectors, $this) => { 31 | const { fetchPage, createWindow } = selectors; 32 | const soundPageUrl = $this.instantPageUrl(selectors, $this); 33 | 34 | const soundHTML = await fetchPage(soundPageUrl); 35 | const soundWindow = createWindow(soundHTML); 36 | const soundDocument = soundWindow.document; 37 | 38 | const ogAudioMeta = soundDocument.querySelector( 39 | `meta[property="og:audio"]` 40 | ); 41 | const ogAudioTypeMeta = soundDocument.querySelector( 42 | `meta[property="og:audio:type"]` 43 | ); 44 | const soundUrl = ogAudioMeta.getAttribute('content'); 45 | const soundType = ogAudioTypeMeta.getAttribute('content'); 46 | 47 | soundWindow.close(); 48 | 49 | return { 50 | soundUrl, 51 | soundType, 52 | }; 53 | }, 54 | } 55 | }) 56 | 57 | console.log(JSON.stringify(results, null, 2)); 58 | }; 59 | 60 | main(); 61 | -------------------------------------------------------------------------------- /examples/typescript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "pretty": true, 4 | "strict": true, 5 | "removeComments": true, 6 | "noImplicitReturns": true, 7 | "noImplicitAny": true, 8 | "noImplicitThis": true, 9 | "alwaysStrict": false, 10 | "strictFunctionTypes": true, 11 | "strictPropertyInitialization": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "diagnostics": true, 14 | "listEmittedFiles": true, 
15 | "importHelpers": true, 16 | "noUnusedLocals": true, 17 | "esModuleInterop": true, 18 | "strictNullChecks": true, 19 | "experimentalDecorators": true, 20 | "isolatedModules": true, 21 | "skipLibCheck": true, 22 | "sourceMap": true, 23 | "resolveJsonModule": true, 24 | "declaration": true, 25 | "target": "es6", 26 | "module": "commonjs", 27 | "jsx": "preserve", 28 | "moduleResolution": "node", 29 | "outDir": "./build", 30 | "lib": ["es6", "es2015", "es2017", "esnext"], 31 | "typeRoots": [ 32 | "./node_modules/@types" 33 | ], 34 | "types": [ 35 | "node", 36 | "jest" 37 | ] 38 | }, 39 | "include": ["./src"], 40 | "exclude": [ 41 | "node_modules", 42 | "**/*.test.ts" 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.5", 3 | "license": "MIT", 4 | "main": "dist/index.js", 5 | "types": "dist/index.d.ts", 6 | "contributors": [ 7 | { 8 | "url": "https://armand1m.dev", 9 | "name": "Armando Magalhaes", 10 | "email": "armando.mag95@gmail.com" 11 | } 12 | ], 13 | "keywords": [ 14 | "nodejs", 15 | "scraper", 16 | "scraping", 17 | "jsdom", 18 | "crawler", 19 | "web-scraping", 20 | "typescript", 21 | "caching", 22 | "geosearching" 23 | ], 24 | "files": [ 25 | "dist" 26 | ], 27 | "engines": { 28 | "node": ">=16" 29 | }, 30 | "repository": { 31 | "type": "git", 32 | "url": "https://github.com/armand1m/papercut.git" 33 | }, 34 | "scripts": { 35 | "start": "tsdx watch", 36 | "build": "tsdx build", 37 | "test": "tsdx test --passWithNoTests", 38 | "test:types": "tsd", 39 | "lint": "tsdx lint", 40 | "docs": "typedoc ./src/index.ts", 41 | "pack:inspect": "yarn pack && tar -ztvf *.tgz", 42 | "generate:readme": "node ./scripts/generateReadme.js < template.md > README.md", 43 | "semantic-release": "semantic-release" 44 | }, 45 | "peerDependencies": { 46 | "jsdom": "^18.1.0", 47 | "pino": "^7.2.0" 48 | }, 49 | "husky": 
{ 50 | "hooks": { 51 | "pre-commit": "tsdx lint" 52 | } 53 | }, 54 | "prettier": { 55 | "printWidth": 70, 56 | "semi": true, 57 | "singleQuote": true, 58 | "trailingComma": "es5" 59 | }, 60 | "name": "@armand1m/papercut", 61 | "description": "Papercut is a scraping/crawling library for Node.js, written in Typescript.", 62 | "author": "Armando Magalhães", 63 | "module": "dist/papercut.esm.js", 64 | "devDependencies": { 65 | "@semantic-release/changelog": "^5.0.1", 66 | "@semantic-release/git": "^9.0.0", 67 | "@semantic-release/github": "^7.2.3", 68 | "@semantic-release/npm": "^7.1.3", 69 | "@types/jsdom": "^16.2.3", 70 | "@types/lodash": "^4.14.158", 71 | "@types/node-fetch": "^2.5.7", 72 | "@types/node-localstorage": "^1.3.0", 73 | "@types/signale": "^1.4.1", 74 | "husky": "^4.2.5", 75 | "jest": "^27.3.1", 76 | "jsdom": "^18.1.0", 77 | "pino": "^7.2.0", 78 | "remark-code-import": "^0.3.0", 79 | "remark-contributors": "^5.1.0", 80 | "remark-gfm": "^1.0.0", 81 | "remark-parse": "^9.0.0", 82 | "remark-stringify": "^9.0.1", 83 | "semantic-release": "^17.4.4", 84 | "tsd": "^0.18.0", 85 | "tsdx": "^0.14.1", 86 | "typedoc": "^0.22.8", 87 | "typescript": "^4.4.4", 88 | "unified": "^9.2.1", 89 | "unified-stream": "^1.0.6", 90 | "unist-util-visit": "^2.0.1" 91 | }, 92 | "dependencies": { 93 | "@supercharge/promise-pool": "^1.5.0", 94 | "fp-ts": "^2.11.5", 95 | "lodash": "^4.17.19", 96 | "node-fetch": "^2.6.0", 97 | "node-localstorage": "^2.1.6" 98 | }, 99 | "resolutions": { 100 | "**/typescript": "^4.4.4", 101 | "**/prettier": "^2.4.1", 102 | "**/node-notifier": ">=8.0.1" 103 | }, 104 | "tsd": { 105 | "directory": "src" 106 | }, 107 | "jest": { 108 | "testEnvironment": "node", 109 | "modulePathIgnorePatterns": [ 110 | "/dist" 111 | ], 112 | "coveragePathIgnorePatterns": [ 113 | "/node_modules/", 114 | "/__fixtures__/" 115 | ], 116 | "collectCoverageFrom": [ 117 | "src/**/*.{ts,tsx}", 118 | "!src/**/*.test-d.ts", 119 | "!**/node_modules/**" 120 | ] 121 | }, 122 | "release": { 
/**
 * Returns the requested part of `content`, addressed by 1-based line numbers.
 *
 * @param {string} content - Full text of the imported file.
 * @param {number|undefined} fromLine - First line to keep (defaults to 1).
 * @param {boolean} hasDash - Whether a range (`from-` or `from-to`) was requested.
 * @param {number|undefined} toLine - Last line to keep; with a dash and no
 *   `toLine`, the slice runs to the end of the file.
 * @returns {string} The untouched content when neither bound is given,
 *   otherwise the selected lines joined with `\n`.
 */
function extractLines(content, fromLine, hasDash, toLine) {
  if (fromLine === undefined && toLine === undefined) {
    return content;
  }

  // Split on LF or CRLF so the result depends on the file being imported,
  // not on the line ending of the operating system running the script.
  const lines = content.split(/\r?\n/);
  const start = fromLine || 1;
  // Without a dash a single line is requested, so the range ends where it starts.
  const end = hasDash ? toLine || lines.length : start;

  return lines.slice(start - 1, end).join('\n');
}

/**
 * Creates a transformer that fills `code` nodes with the content of a file.
 *
 * A node opts in through a `file=` entry in its space-separated meta string:
 * `file=<path>`, `file=<path>#L<from>`, `file=<path>#L<from>-` or
 * `file=<path>#L<from>-L<to>`. The path is resolved against `file.cwd` and
 * the (trimmed) selected lines replace `node.value`.
 *
 * @param {{ async?: boolean }} [options] - With `async: true` files are read
 *   asynchronously and the transformer returns a promise.
 * @returns {(tree: object, file: { cwd: string }) => (Promise<unknown[]>|undefined)}
 *   The transformer. It returns a promise only when asynchronous reads were
 *   started, and `undefined` otherwise.
 * @throws {Error} From the transformer, when a `file=` entry cannot be parsed.
 */
function codeImport(options = {
  async: false
}) {
  // Named groups: `path` (required), `from`, `dash` and `to` (all optional).
  const fileMetaPattern = /^file=(?<path>.+?)(?:(?:#(?:L(?<from>\d+)(?<dash>-)?)?)(?:L(?<to>\d+))?)?$/;

  return function transformer(tree, file) {
    const codeNodes = [];
    const pendingReads = [];

    // Collect first, then process, so nodes are not rewritten mid-traversal.
    visit(tree, 'code', (node) => {
      codeNodes.push(node);
    });

    for (const node of codeNodes) {
      const fileMeta = (node.meta || '')
        .split(' ')
        .find((meta) => meta.startsWith('file='));

      if (!fileMeta) {
        continue;
      }

      const match = fileMetaPattern.exec(fileMeta);
      if (!match || !match.groups || !match.groups.path) {
        throw new Error(`Unable to parse file path ${fileMeta}`);
      }

      const { path: filePath, from, dash, to } = match.groups;
      const hasDash = Boolean(dash);
      const fromLine = from ? Number.parseInt(from, 10) : undefined;
      const toLine = to ? Number.parseInt(to, 10) : undefined;
      const fileAbsPath = path.resolve(file.cwd, filePath);

      // Shared by the sync and async paths so both apply identical rules.
      const applyContent = (fileContent) => {
        node.value = extractLines(
          fileContent,
          fromLine,
          hasDash,
          toLine
        ).trim();
      };

      if (options.async) {
        pendingReads.push(
          fs.promises.readFile(fileAbsPath, 'utf8').then(applyContent)
        );
      } else {
        applyContent(fs.readFileSync(fileAbsPath, 'utf8'));
      }
    }

    if (pendingReads.length) {
      return Promise.all(pendingReads);
    }
  };
}
/**
 * Returns the body of `url` as text, consulting the `pagecache` store first.
 *
 * The cache key is the hash of the URL. A cached body is returned without
 * any network access; otherwise the page is downloaded and returned.
 *
 * @param url Address of the page to download.
 * @returns The cached body when one exists, otherwise the downloaded body.
 */
export const fetchPage = async (url: string) => {
  const hashKey = hash(url);
  const cacheResponse = pagecache.getItem(hashKey);

  if (cacheResponse) {
    return cacheResponse;
  }

  const response = await fetch(url);
  const payload = await response.text();

  // Persist successful responses only: caching an error page would make
  // every later call for this URL return that error body.
  if (response.ok) {
    pagecache.setItem(hashKey, payload);
  }

  return payload;
};
/**
 * Resolves a free-text query to coordinates through the Nominatim search
 * endpoint, reusing a previously stored answer from `geocache` when present.
 *
 * @param q Text to search for; its hash is the cache key.
 * @param limit Value sent as the `limit` query parameter (defaults to 1).
 * @returns Latitude and longitude taken from the first returned location.
 * @throws Error when the service returns no locations for `q`.
 */
export const geosearch = async (q: string, limit: number = 1) => {
  const cacheKey = hash(q);
  const cached = geocache.getItem(cacheKey);

  if (cached) {
    return JSON.parse(cached) as GeosearchResult;
  }

  const query = new URLSearchParams({
    q,
    limit: Number(limit).toString(),
    format: 'json',
  });
  const endpoint = `https://nominatim.openstreetmap.org/search?${query.toString()}`;

  const response = await fetch(endpoint);
  const locations: Location[] = await response.json();

  if (!locations?.length) {
    throw new Error(`No response for Address: ${q}`);
  }

  // Only the first match is used, whatever `limit` was requested.
  const first = locations[0];
  const coordinates: GeosearchResult = {
    latitude: Number(first.lat),
    longitude: Number(first.lon),
  };

  geocache.setItem(cacheKey, JSON.stringify(coordinates));

  return coordinates;
};
-------------------------------------------------------------------------------- 1 | export { createScraper } from './scraper/createScraper'; 2 | export type { 3 | Scraper, 4 | ScraperOptions, 5 | ScraperProps, 6 | } from './scraper/createScraper'; 7 | 8 | export { createRunner } from './scraper/createRunner'; 9 | export type { 10 | SelectorMap, 11 | SelectorFunction, 12 | CreateRunnerProps, 13 | RunProps, 14 | } from './scraper/createRunner'; 15 | 16 | export { scrape } from './scraper/scrape'; 17 | export type { ScrapeProps, ScrapeResultType } from './scraper/scrape'; 18 | 19 | export { createSelectorUtilities } from './selectors/createSelectorUtilities'; 20 | export type { SelectorUtilities } from './selectors/createSelectorUtilities'; 21 | 22 | export { geosearch } from './http/geosearch'; 23 | export type { GeosearchResult } from './http/geosearch'; 24 | 25 | export { fetchPage } from './http/fetchPage'; 26 | export { createWindow } from './utilities/createWindow'; 27 | -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0039f8b7ef5f473fc3c80643193b61f5227303cec6a724a9802a259bb5aef020: -------------------------------------------------------------------------------- 1 | {"latitude":52.3639288,"longitude":4.8962294} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/03f6865c192bedf244aa6c5ea85a932f49e1cadd235af6b1c57413606888978f: -------------------------------------------------------------------------------- 1 | {"latitude":52.3753268,"longitude":4.8934766} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0812462f763a02c1355c811fa8e8c3484084d701532c5e66b1b6eb85e88ad1b5: -------------------------------------------------------------------------------- 1 | {"latitude":52.3790679,"longitude":4.8938367} 
-------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0846132a14aa0ce8893952d4547bf4d5481aa1d71ed3b3b50b81efe833ab5d29: -------------------------------------------------------------------------------- 1 | {"latitude":52.3825524,"longitude":4.871201} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0865e061da0580ae96eb24647eebd4078d7359ed7480608b6a213749349aa14a: -------------------------------------------------------------------------------- 1 | {"latitude":52.3585898,"longitude":4.8592885} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/086f7bfc27a9c76968cd048d92dedb33e06ded64e33f820091c51f2d8408874a: -------------------------------------------------------------------------------- 1 | {"latitude":52.3767148,"longitude":4.9019324} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0a6b263a3db63b4c2ff99431a9e1481bafb90cb21aad916e428c0012e815df5f: -------------------------------------------------------------------------------- 1 | {"latitude":52.380066,"longitude":4.8929225} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/0c544ac3d4edab094567372665274966288cd8cd288b47e52ea94614f9c52311: -------------------------------------------------------------------------------- 1 | {"latitude":52.3651803,"longitude":4.9258144} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/10b3d7e47d8ea217e8a0fbe8eea70d91908cd407c178fd9acac02fd5a2f9786e: -------------------------------------------------------------------------------- 1 | {"latitude":52.378892,"longitude":4.893038} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/1495740b7f854575a94e51db8674fcc16c04d487ebc6b926b2f5db87840d6888: -------------------------------------------------------------------------------- 1 | {"latitude":52.369998,"longitude":4.900897} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/159c45a4e39526bd2f1ff5fdad16a68f6f736a637606cfac1ca381e2159e0434: -------------------------------------------------------------------------------- 1 | {"latitude":52.3548348,"longitude":4.8932984} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/16682a84168986c37c3616d2dd97d75bbd7e906a54bb27c8547655c8ee415c2e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3700049,"longitude":4.896309} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/18916a50c7bf475ffb0472fbfc93e77c5b8bb8d2487084631ee0402be724dc25: -------------------------------------------------------------------------------- 1 | {"latitude":52.3654872,"longitude":4.8534292} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/19233dc26e4a376cb27c1589699bbc020213389d94c3d798d531cb78c8c02ddf: -------------------------------------------------------------------------------- 1 | {"latitude":52.3669858,"longitude":4.8528824} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/1df29b9f95d1aecafb8705a8f77c730ea61e6d3c5f43dc57341c0a2d8080c578: -------------------------------------------------------------------------------- 1 | {"latitude":52.355811,"longitude":4.8918846} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/1f5330e42d87b16cf2bb2bbc4f1b36dcdf549bf862e925f329444f6ab11b9ad3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3794229,"longitude":4.8940052} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/2168bbb398e97d02a86e146618af7abd08ef83c0370a19bfb363f167ed8f6f9b: -------------------------------------------------------------------------------- 1 | {"latitude":52.3806956,"longitude":4.8908886} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/21b842ae27089920ead9694ce43de0bd3b3a32e8b115e73a47ab458a10cbc984: -------------------------------------------------------------------------------- 1 | {"latitude":52.3554719,"longitude":4.9007255} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/2590447e3b2e84d12febf10ad92e459ed3e7f642e64e89139a87a332b042216c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3742268,"longitude":4.8936043} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/276b07f17baf1ed948c646d915be4b24ed5958585ff641d837462eb04fa75dae: -------------------------------------------------------------------------------- 1 | {"latitude":52.371476,"longitude":4.895748} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/29476a326b54102cdcf21808c6c293d998424eccbb189c2238e2f26d6bab0e6e: -------------------------------------------------------------------------------- 1 | {"latitude":52.372668,"longitude":4.8494805} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/2bf98eafbb327e0c92a670393799b5db96e2dacd59b044a28f68dc402bbfd0a9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3799353,"longitude":4.892811} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/2e13e4794d7dba7bb42642d7559ac132c875e0dce30192dd497c5eef56df036c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3757041,"longitude":4.9001615} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/2f10feb31b446cf536b4d21de782b40e0a71c20e8dce450567ef8928e4f0bdb7: -------------------------------------------------------------------------------- 1 | {"latitude":52.3700452,"longitude":4.8973658} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/2fc8ca6ef389738847f5f86c4e50a1179717ab97274582b72ce4445a1edb7aac: -------------------------------------------------------------------------------- 1 | {"latitude":52.3730673,"longitude":4.9000786} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/3b7022aa71348539fa1cee9459a6ea58725c8ffbb4e5c15398e4768e7db261ca: -------------------------------------------------------------------------------- 1 | {"latitude":52.3643312,"longitude":4.8850229} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/3ca975e1dd276b100f88537968077d6438b5659632a0c6d3ae5c5f83bd96e032: -------------------------------------------------------------------------------- 1 | {"latitude":52.3563981,"longitude":4.9162761} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/3cb066e01161d192b2a0465d1a2ab684af8d4f38972112ebccbf633e0350e484: -------------------------------------------------------------------------------- 1 | {"latitude":52.4019561,"longitude":4.8984972} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/3ce2a37f2f2528dd949553e85466a2728e22810f357843a5367058453d573aca: -------------------------------------------------------------------------------- 1 | {"latitude":52.3753547,"longitude":4.8973747} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/3f0f5debfa379c049135b9acad0401a3b93afb475fec5a66ea1a9542afb9f839: -------------------------------------------------------------------------------- 1 | {"latitude":52.362266,"longitude":4.8861579} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/43a1b6fd5cc0042a803465cff092510e7d1123174042dce331ca98b2956823c1: -------------------------------------------------------------------------------- 1 | {"latitude":52.3699089,"longitude":4.8820968} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/447112d290ca7c20d90de2da63021629c157086676f08d73be9e5ffde16edd5e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3699972,"longitude":4.8490427} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/44ac14bcab44f9d625144c88487af8a73bee2cc89d1c0d4a972b150a50b00930: -------------------------------------------------------------------------------- 1 | {"latitude":52.3742317,"longitude":4.8926204} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/47467a74141ec3dafc68ef5947ade64ca3442c5ed674648d69e29fa4d53086a9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3715629,"longitude":4.8758363} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/47b020185d20b21b9b78655fa1485df8baf672bb748cbb5c7ff3472ba931ab13: -------------------------------------------------------------------------------- 1 | {"latitude":52.36696,"longitude":4.8941494} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/49f30821ebf1efcb3e3fb9a0f44fb8a218cfad214ad50db76e61bfeefe57ec36: -------------------------------------------------------------------------------- 1 | {"latitude":52.3656175,"longitude":4.8852579} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/4baa8d744daa0f0e5b4096efd59f0284dd8a6c25019945cbaf1eba9a873369e6: -------------------------------------------------------------------------------- 1 | {"latitude":52.3668909,"longitude":4.8953892} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/4be34a1de09f84d141cda319bfba0ba35f0ba67e936b7635e0a8f03c6374da5c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3778862,"longitude":4.8093301} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/4d562820676ed7aa893f42423230ae456a1a37912194741ea6c1c9d9e4123f3b: -------------------------------------------------------------------------------- 1 | {"latitude":52.3560695,"longitude":4.8910319} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/4ee4f89da06c817cb80a6a037dd9138f7c18ac67199156d72f6e2d4f2c49e310: -------------------------------------------------------------------------------- 1 | {"latitude":52.3703176,"longitude":4.8894189} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/509877e1e1fd08ea12075ec2d733571e2f9d009c8b2a55fb9f926ae31704d589: -------------------------------------------------------------------------------- 1 | {"latitude":52.3665552,"longitude":4.8902393} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/52741613a7ee04eea5b8740920c426a1ffbf9c55ba27d97c35384424ed4e58a7: -------------------------------------------------------------------------------- 1 | {"latitude":52.3651124,"longitude":4.8955556} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/52b52c63469184633d1af1f56fd255b9683329264ed2ee11c52dad9a1a48ded3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3623061,"longitude":4.8575482} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/539ea28205ec567c2e7f72b6423ccf23ba9f6c81f5971c4ade843776e3e84bdb: -------------------------------------------------------------------------------- 1 | {"latitude":52.3530315,"longitude":4.8846713} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/53cd7d077a57c9b81622acbf09b628c88ae105eb911e93b70a2a0c66534374d6: -------------------------------------------------------------------------------- 1 | {"latitude":52.3722565,"longitude":4.875785} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/581d10f0b8842522dd78714995682c4c4a3c79ac14c78bd273e407dcb2702f57: -------------------------------------------------------------------------------- 1 | {"latitude":52.3745074,"longitude":4.8812359} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/5830a524893ffb18a1049e68550bde23f84f8c42272a80c6b935c9542011a9ff: -------------------------------------------------------------------------------- 1 | {"latitude":52.3725695,"longitude":4.8594229} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/5b77f6cf60383b79bad8206845892062812c3ca46941f055910b87a3fb3dd2d3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3745453,"longitude":4.8902379} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/5d84e577ac3e47a12ac74f950e6920a961ef2213bbbe70defa58379ab3ef221e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3606116,"longitude":4.866253} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/5ea874201efb2070f093965db13ac1cb0b40bbae58148957fbf393a957d625f0: -------------------------------------------------------------------------------- 1 | {"latitude":52.345061,"longitude":4.8641671} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/615c3c5585c1d738abf73c0a5509237049cec5bdf585b8d1d1f3894bdb8bf384: -------------------------------------------------------------------------------- 1 | {"latitude":52.3803036,"longitude":4.89217} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/61d8a19d6f9f13aede3bde2686ff536b7caeaaab4675dccc924ca08555c1395e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3717046,"longitude":4.8952505} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6212bbf0aff267cee30a12c98deb373a9bbc01211a9e45872ea205ae75e3c988: -------------------------------------------------------------------------------- 1 | {"latitude":52.3746905,"longitude":4.8888723} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6260ca5dcdcc7877fab7ace99fde01e6a7cf75a3b9fe9e6109dfa0e17042cd56: -------------------------------------------------------------------------------- 1 | {"latitude":52.3648554,"longitude":4.898009} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/63e0b19c9f94c8666c8b8e47a662e80e27e3e91db0694bd906d8091c85b778eb: -------------------------------------------------------------------------------- 1 | {"latitude":52.3700092,"longitude":4.8973513} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/64b05c14e6e5c699c3baa90289035105d716f8445acf67639bd372af94ca6962: -------------------------------------------------------------------------------- 1 | {"latitude":52.3923061,"longitude":4.9563296} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/66fe226ba471a9ed95d603e384251109834a45abc21f4b74d975fdfc7b1b7c16: -------------------------------------------------------------------------------- 1 | {"latitude":52.3888396,"longitude":4.8872877} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/6a1accdd3a86ba9f8ea9150e9fbe3202613b7c09d1a6449d4ef4825b7fcc700c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3661306,"longitude":4.9298274} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6a2f3f92327a2d5e4855d522376a0471f477e64720c81e908af2496be887ba0a: -------------------------------------------------------------------------------- 1 | {"latitude":52.3821712,"longitude":4.8785639} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6a7e99f73e5a5a64997acb3f9b72172e12da2fc09fac20c6d19b670d4dfc2d99: -------------------------------------------------------------------------------- 1 | {"latitude":52.3687535,"longitude":4.8882265} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6c08309e8e935fdf8246d2b5d2eabd04b6ec2bcdaa9ac35b2810825ab0b8fb65: -------------------------------------------------------------------------------- 1 | {"latitude":52.3689918,"longitude":4.8592893} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6d37e5a901e039cdd6a3bb9d79467b2e3483c7f327629451588408dccd762370: -------------------------------------------------------------------------------- 1 | {"latitude":52.3523465,"longitude":4.9034616} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/6ee26c51cf92b520ed100633b505d2828700afc3f048de9e26c86ece9a6faa7b: -------------------------------------------------------------------------------- 1 | {"latitude":52.3635636,"longitude":4.8632801} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/732f627132f66d39d986f23d48e0dbaf917f787aaf502805317fea080845fd53: -------------------------------------------------------------------------------- 1 | {"latitude":52.3755551,"longitude":4.8890435} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/776ab3dbaa606a6f1617a5b69d06ff4fff9da4b141c52311ac1c5c54119e73cb: -------------------------------------------------------------------------------- 1 | {"latitude":52.3515868,"longitude":4.8945016} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/795e9d05e106330894fa39a092fa123fdf43bdd5e538529795124b45bb228ba6: -------------------------------------------------------------------------------- 1 | {"latitude":52.3616759,"longitude":4.8862418} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/7a85e2edf0c1accc7c510d710f370eb81ef8d3099d5f90a95247e660953b5f71: -------------------------------------------------------------------------------- 1 | {"latitude":52.3778364,"longitude":4.8959341} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/7be172f36224113c5bc72f1f9536b693cf8b2cfc16ebe80ed63e3ee10f6154ff: -------------------------------------------------------------------------------- 1 | {"latitude":52.3659665,"longitude":4.8934466} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/7c60636bc1026a3cd0f76321849377237492631439c7bb4715293a5186140223: -------------------------------------------------------------------------------- 1 | {"latitude":52.3668215,"longitude":4.8958891} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/7ef44a2aad2a5f3d1df9ddf4168fcd6e56301bb320b7a48a483d5368b1cf141a: -------------------------------------------------------------------------------- 1 | {"latitude":52.36665,"longitude":4.9022485} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/8040cede87d529be0b51b41cfcecd9bd2ad457e56bc37f51c5e85caaa9dc4dc9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3668283,"longitude":4.8949199} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/80d586dca48e19177775d45973cd70bd82ef0d09d7ff22f2ef2c89357eb2c56b: -------------------------------------------------------------------------------- 1 | {"latitude":52.3536957,"longitude":4.8871751} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/81a94845a1aa2fb212ce9b921d62bf6691cc6e02d18d48f243c1ac96f167fc8c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3736775,"longitude":4.8976779} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/82f1e31ad4ea57b73e33cacd668fafdba14649c492aec656bc239cdbdc5d9adb: -------------------------------------------------------------------------------- 1 | {"latitude":52.3934497,"longitude":4.9129591} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/847f389e6ed45c4756f5fb08d2b89b951713873d3bab29e5602c087e90e44daa: -------------------------------------------------------------------------------- 1 | {"latitude":52.3547848,"longitude":4.9222237} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/84d5858d72bd35ae8c8c39775a1871fc44a454900b64d5f6616ff443695cd006: -------------------------------------------------------------------------------- 1 | {"latitude":52.3726966,"longitude":4.8761177} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/886f6867352127d8d4f9d1a9232a3e5bf290a687ac7165ce2663ad438111b82b: -------------------------------------------------------------------------------- 1 | {"latitude":52.3642612,"longitude":4.8851485} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/88a69063a977ea7ce6738ad7b5516fa605a7ef4a156bae2b1c49165bd06b10cc: -------------------------------------------------------------------------------- 1 | {"latitude":52.3674871,"longitude":4.8639469} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/8e9bea954e6f2fd89b5397b1e90d9d1b1d5ad2bb8f336f42768a6036ddf05456: -------------------------------------------------------------------------------- 1 | {"latitude":52.3688767,"longitude":4.8537108} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/8ff420d6e8bb1b82440059d0d6258427d3b87717b0886328bfceb3fe6771fc48: -------------------------------------------------------------------------------- 1 | {"latitude":52.3705649,"longitude":4.8889467} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/921fd5cf3c36cb9c5f284badd6c2489d9ee09b5bd2ba4bcb6a6f94c9244e5fa3: -------------------------------------------------------------------------------- 1 | {"latitude":52.371476,"longitude":4.8891563} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/96df13de11537bf367a7820bb4e722751c3af0c5e18945e46dd7eecdcc5ea0d0: -------------------------------------------------------------------------------- 1 | {"latitude":52.3728238,"longitude":4.8998311} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/9c6c0d631d904f4c0f959930ddaa6d807af22262e6a55ebe9a2cc5ee7f040640: -------------------------------------------------------------------------------- 1 | {"latitude":52.3659011,"longitude":4.859517} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/a40223ee69b3934cf80093c95899d50ede2addf45231ddf93148604f959029b3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3718104,"longitude":4.8790205} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/a57fddb893880b0fa5563d853073b3169d79f6777f11dcf8d1d0c82e93a7ea3e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3677884,"longitude":4.8960666} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/a82b7aca1ac3b4708e9449cdfb837482eb493a3c94a175d1409e365a8ad097d8: -------------------------------------------------------------------------------- 1 | {"latitude":52.3642355,"longitude":4.882745556059698} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/a99f86419c42445e61caee13a8f09414df741caa96d35d8054a699313cfe4bfa: -------------------------------------------------------------------------------- 1 | {"latitude":52.3725137,"longitude":4.8962514} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/aaa2a7e15e6c3c59a4209e4f5f9ebb6b7ce103411cb6bbd3f834bd835d805d71: -------------------------------------------------------------------------------- 1 | {"latitude":52.3656552,"longitude":4.8964012} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/acf8fa2ef6767dc46746ac4c110ad753a2d2debbfb31860b64199e2de76b3353: -------------------------------------------------------------------------------- 1 | {"latitude":52.3652516,"longitude":4.8858642} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ad918b351a653069232d6e175ebc676ed1bb2e9df85a77d356bc20609f008e39: -------------------------------------------------------------------------------- 1 | {"latitude":52.4115426,"longitude":4.9222305} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ae4cc26f6fd0e3d4bca9c07a025f80404f99ba5fcbbeaf2df03cd60410895286: -------------------------------------------------------------------------------- 1 | {"latitude":52.3651426,"longitude":4.9279633} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/afae2cb5f1dd188f6a11c4e3d22c4534e75ecfa0283a38eba2f59004e368c052: -------------------------------------------------------------------------------- 1 | {"latitude":52.3791277,"longitude":4.8793332} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/b059a3db66fe137c46dc97bdce050f918bc37f928c84b25a57e7db71a96bea55: -------------------------------------------------------------------------------- 1 | {"latitude":52.3790237,"longitude":4.8937968} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/b234601b6064291f6b38ba4cfa54bdf283706a18b352bd03c10a53246df8620f: -------------------------------------------------------------------------------- 1 | {"latitude":52.3734139,"longitude":4.8883416} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/b41c0a1e6a8e3d75b5e372ea64ff5bc6398e7333593b1783ee0ca17915ee7b2b: -------------------------------------------------------------------------------- 1 | {"latitude":52.357249,"longitude":4.9012821} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/b473e52ed1608c361af24df03d99b8a7301a8b4f3de6fa26c0a12af26edb86b3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3694651,"longitude":4.8823176} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/b509fb74d9652dfd3ac88a193e15d9b3ce3787ac0aadaff3944d4fef55209ec5: -------------------------------------------------------------------------------- 1 | {"latitude":52.3759804,"longitude":4.8959932} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/b9dbba925241dd16c32f89c45fe471171b4c68a83774e016a6ba522925376878: -------------------------------------------------------------------------------- 1 | {"latitude":52.3704054,"longitude":4.8986812} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/bc526e33f9dc7bcde0ae23acbfb37b96daf1031613dbb644b3ba4e96fa8719b7: -------------------------------------------------------------------------------- 1 | {"latitude":52.3667407,"longitude":4.8959047} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/bd07bfca38af1872baa418c3d14c03c44cd4b4c8efa8074b05dd1a963c2626f9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3702203,"longitude":4.9299319} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/be55b63a12c4b9157f7e2b4327b28832347dbc5896757ca0448d633ad006fb94: -------------------------------------------------------------------------------- 1 | {"latitude":52.3821894,"longitude":4.8871855} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/bf978ffabf761580fb03df4b7a20b7158f2be8cedf4de380f20e4da67e94932e: -------------------------------------------------------------------------------- 1 | {"latitude":52.3763525,"longitude":4.9020334} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/bfee406d72533b4dd03f274c1f3096cba91846295f2d327cae5f13bf1c87fe3d: -------------------------------------------------------------------------------- 1 | {"latitude":52.362266,"longitude":4.8861579} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/c23e066a9ce8b5046d5f5560c5ac44e55c153ff4f456e4c0e574d3d4d7836d12: -------------------------------------------------------------------------------- 1 | {"latitude":52.367449,"longitude":4.8620091} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/c32fcc15a3791eb9cbc79de83e60259e70226750a5a97f45a865a985c983ae3c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3677317,"longitude":4.8904809} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/c3cd9d3e5a8bb951df50701a44ebc3d1abc0cbeeaa2a580eaaaf4c56c6f5ae4f: -------------------------------------------------------------------------------- 1 | {"latitude":52.3750993,"longitude":4.8916121} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/c5e0dd800d036c473a2ab80d849dd82849e2343df88d1287cfa2d651b1596f85: -------------------------------------------------------------------------------- 1 | {"latitude":52.3564155,"longitude":4.8955743} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/c768c09aa5fbacfc136d04b74c779a5f659e77936f590d832adca32d0200cb49: -------------------------------------------------------------------------------- 1 | {"latitude":52.3728506,"longitude":4.8998166} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ca63cc90b22f649ffeb20c63489bec1f5d8f604a53ee38199f5e377c06db195c: -------------------------------------------------------------------------------- 1 | {"latitude":52.3688485,"longitude":4.8847515} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ccf85a150785ed391d454d5b5440500b3edd1681cc5423a045847439a8ff76de: -------------------------------------------------------------------------------- 1 | {"latitude":52.3662607,"longitude":4.8993897} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/cdbbddd466905a01c2ecf0ddd635ca614d373e969aaf9d5e1011cdc9bc27d9be: -------------------------------------------------------------------------------- 1 | {"latitude":52.3781714,"longitude":4.8829411} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/d0122eab7c54f3994157eb0a1fc4ee7971b6591e964790e1ca5acb28aa9de817: -------------------------------------------------------------------------------- 1 | {"latitude":52.3667845,"longitude":4.8892776} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/d3caf3820e69ac74dbec4de80472dd83bb11b2efd998e6b6783affd233d149d6: -------------------------------------------------------------------------------- 1 | {"latitude":52.3435253,"longitude":4.86442} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/d9dc442d9427d056c1d44ff2ca97e7677eeb551e039bce144cfb6cf0af9d02ad: -------------------------------------------------------------------------------- 1 | {"latitude":52.3727074,"longitude":4.8786137} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/dc9ef75a046a575921382818ba2c917c97a81f1475e18cbe4c16b8dbc8a7ed94: -------------------------------------------------------------------------------- 1 | {"latitude":52.3559996,"longitude":4.9088087} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/dd13dddae0dfa2cde11eb0a9cb67b9164586bf630aafcb405d415b7a5a163fc9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3697849,"longitude":4.9009948} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/de29e424a2e0a0da1b6b7a4154723e7db4d55c486687d018158e9081430eefca: -------------------------------------------------------------------------------- 1 | {"latitude":52.3654806,"longitude":4.8975372} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/df4d08a52efcb2490d6510215be76551e359ff8fa4cc9e3388c685774f90a1cd: -------------------------------------------------------------------------------- 1 | {"latitude":52.3823631,"longitude":4.8730538} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/e1228869bcfe79268095e182010ca2da780bded8163c5106e7d06e22ef934de5: -------------------------------------------------------------------------------- 1 | {"latitude":52.356921,"longitude":4.8091398} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/e24e02f608ba963bcbfa11e0d450c78143a52c1eae53a6d33bf110f136cd35b6: -------------------------------------------------------------------------------- 1 | {"latitude":52.3533323,"longitude":4.899703} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/e253b14111fd8c2dd92c7d8b042a006d9587c025600ac3ef809b9458319e8752: -------------------------------------------------------------------------------- 1 | {"latitude":52.3298234,"longitude":4.8982014} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/e659a0525d32ee2d8ebf6e43918ba1c6d91e0ce0aa71e1df444c00b3d67c04d9: -------------------------------------------------------------------------------- 1 | {"latitude":52.3454885,"longitude":4.8579261} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/e6a0fcbf751aff8b2b7466ad861e4610f0a536542865185c94537c0f1bb991dc: -------------------------------------------------------------------------------- 1 | {"latitude":52.3720587,"longitude":4.8911669} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/e8d781d3658104ce9cf35ce4a65c22265af530eca2fa8eca63086d9582f17c13: -------------------------------------------------------------------------------- 1 | {"latitude":52.3768482,"longitude":4.8972025} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/eae2617293c7b01db0c159cb34af9330e5be00ab6fcc41814f604aae9201547f: -------------------------------------------------------------------------------- 1 | {"latitude":52.3693527,"longitude":4.8929391} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ecb15a2b447ab52a3e10899b3c8b99a419cafab02f16b9285fe4afe4a679faa3: -------------------------------------------------------------------------------- 1 | {"latitude":52.3454885,"longitude":4.8579261} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/efc7ff96ccf15cd169249056c0095ab5dbf94a78c1bb3b674721e7c6cdfd4c64: -------------------------------------------------------------------------------- 1 | {"latitude":52.3718645,"longitude":4.9026614} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/efec83556909f26863444c428582f21b8b6794d07bc21c486bdda5288aa68f9a: -------------------------------------------------------------------------------- 1 | {"latitude":52.3814043,"longitude":4.8862981} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/f33f84d975513b59cf3c05af6dbce3e31c99549288fd8aaad7cfe2a2821d70b5: -------------------------------------------------------------------------------- 1 | {"latitude":52.368831,"longitude":4.888386} -------------------------------------------------------------------------------- 
/src/scraper/__fixtures__/testgeocache/fa1b24d9aa017ffd09e2f01b104be89f6eff1a2ff28b6f8383fd2e56f337305f: -------------------------------------------------------------------------------- 1 | {"latitude":52.3657218,"longitude":4.880148} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/fadead3afed615bb416c091586819d45ba96a94a429e44f6a21ecae61c43e9b4: -------------------------------------------------------------------------------- 1 | {"latitude":52.364748399999996,"longitude":4.941496018243107} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ff1386526e44cfb16ee23bcc03959b8f2f973606c884d4b4d2f7560c81412001: -------------------------------------------------------------------------------- 1 | {"latitude":52.3587518,"longitude":4.9410238} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testgeocache/ffeccce77c62cfd793f1854b5b905117bd00a79de4fb642f0d6b31bf366ea1c7: -------------------------------------------------------------------------------- 1 | {"latitude":52.3696493,"longitude":4.8524867} -------------------------------------------------------------------------------- /src/scraper/__fixtures__/testpagecache/0f63a2a5a5620b745938e6a248b49704e580e3c8d9d24a4fb7ec68461f1edd41: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Hacker News
5 | 11 | 136 | 146 |
6 | 10 |
Hacker News 7 | new | past | comments | ask | show | jobs | submit 8 | login 9 |
12 | 13 | 15 | 16 | 17 | 19 | 20 | 21 | 23 | 24 | 25 | 27 | 28 | 29 | 31 | 32 | 33 | 35 | 36 | 37 | 39 | 40 | 41 | 43 | 44 | 45 | 47 | 48 | 49 | 51 | 52 | 53 | 55 | 56 | 57 | 59 | 60 | 61 | 63 | 64 | 65 | 67 | 68 | 69 | 71 | 72 | 73 | 75 | 76 | 77 | 79 | 80 | 81 | 83 | 84 | 85 | 87 | 88 | 89 | 91 | 92 | 93 | 95 | 96 | 97 | 99 | 100 | 101 | 103 | 104 | 105 | 107 | 108 | 109 | 111 | 112 | 113 | 115 | 116 | 117 | 119 | 120 | 121 | 123 | 124 | 125 | 127 | 128 | 129 | 131 | 132 | 133 | 134 |
1. U.S. states file updated antitrust complaint against Google (reuters.com)
14 | 127 points by mancerayder 2 hours ago | hide | 39 comments
2. Hoax email blast abused poor coding in FBI website (krebsonsecurity.com)
18 | 184 points by todsacerdoti 4 hours ago | hide | 44 comments
3. The case of the 500-mile email (2002) (ibiblio.org)
22 | 201 points by thunderbong 6 hours ago | hide | 35 comments
4. Reverse-engineering the Yamaha DX7 synthesizer's sound chip from die photos (righto.com)
26 | 311 points by picture 9 hours ago | hide | 108 comments
5. Study finds a difference between neurons of humans and other mammals (news.mit.edu)
30 | 64 points by dootah 4 hours ago | hide | 21 comments
6. Dataminers are finding developer comments and unlicensed songs in GTA Trilogy (pcgamer.com)
34 | 73 points by bobitsaboy 5 hours ago | hide | 29 comments
7. In a ‘learning trap’ experiment, adults leap conclusions while children explore (wsj.com)
38 | 87 points by jkuria 5 hours ago | hide | 38 comments
8. Richard ‘Lowtax’ Kyanka, founder of Something Awful, has died (vice.com)
42 | 43 points by elsewhen 3 hours ago | hide | 22 comments
9. How Michigan grew its startup ecosystem (timesofe.com)
46 | 95 points by rmason 6 hours ago | hide | 50 comments
10. Restic – Backups Done Right (restic.net)
50 | 424 points by IceWreck 13 hours ago | hide | 226 comments
11. Vaccum-chamber quantum sensor device could allow GPS-free navigation (breakingdefense.com)
54 | 53 points by tomohawk 4 hours ago | hide | 30 comments
12. Show HN: Stringmark – Chrome extension that makes online research easier (stringmark.com)
58 | 68 points by johanznsdn 6 hours ago | hide | 16 comments
13. Spotify Codes – Part 2 (boonepeter.github.io)
62 | 36 points by healeycodes 5 hours ago | hide | 7 comments
14. Ask HN: Advice for moving on from a failed startup?
66 | 59 points by throwaway9838 5 hours ago | hide | 37 comments
15. Breaking down the 'payment for order flow' debate (a16z.com)
70 | 17 points by ali92hm 2 hours ago | hide | 12 comments
16. Show HN: Plain Org – a new org mode app for iOS (apps.apple.com)
74 | 32 points by xenodium 5 hours ago | hide | 20 comments
17. Managing an external display on Linux shouldn’t be this hard (complete.org)
78 | 68 points by todsacerdoti 9 hours ago | hide | 53 comments
18. Substack (YC W18) is hiring engineering managers to build the future of writing (lever.co)
82 | 5 hours ago | hide
19. When there seems to be no way out – customer discovery for your head (steveblank.com)
86 | 42 points by sblank 6 hours ago | hide | 3 comments
20. The weirdest bug I've ever encountered (mental-reverb.com)
90 | 81 points by tjalfi 9 hours ago | hide | 18 comments
21. In the 17th century, Leibniz dreamed of a machine that could calculate ideas (ieee.org)
94 | 91 points by malshe 12 hours ago | hide | 38 comments
22. Show HN: Marvin – A grumpy Slackbot who doesn't like your ideas (github.com/keltrycroft)
98 | 94 points by krycroft 12 hours ago | hide | 19 comments
23. Metaobject protocols: Why we want them and what else they can do (1993) [pdf] (duke.edu)
102 | 38 points by Jtsummers 8 hours ago | hide | 17 comments
24. The strong and weak forces of architecture (martinfowler.com)
106 | 25 points by joeyespo 5 hours ago | hide | 9 comments
25. Project Euler (projecteuler.net)
110 | 251 points by tosh 9 hours ago | hide | 102 comments
26. Dte: A language for expressing and calculating date and time (github.com/mvrozanti)
114 | 48 points by nixcraft 8 hours ago | hide | 17 comments
27. Gin, television, and social surplus, or, “looking for the mouse” (2008) (gist.github.com)
118 | 96 points by nz 11 hours ago | hide | 14 comments
28. Show HN: Relative Bookmarks Browser Extension (github.com/duiker101)
122 | 22 points by duiker101 5 hours ago | hide | 3 comments
29. Sign arbitrary data with your SSH keys (agwa.name)
126 | 548 points by h1x 17 hours ago | hide | 281 comments
30. Chemists discover new way to harness energy from ammonia (phys.org)
130 | 55 points by theduder99 11 hours ago | hide | 27 comments
135 |

Guidelines 137 | | FAQ 138 | | Lists 139 | | API 140 | | Security 141 | | Legal 142 | | Apply to YC 143 | | Contact

Search: 144 |
145 |
147 | -------------------------------------------------------------------------------- /src/scraper/createLogger.ts: -------------------------------------------------------------------------------- 1 | import pino from 'pino'; 2 | 3 | export interface CreateLoggerProps { 4 | name: pino.LoggerOptions['name']; 5 | enabled?: pino.LoggerOptions['enabled']; 6 | } 7 | 8 | export type Logger = pino.Logger; 9 | 10 | export const createLogger = ({ 11 | name, 12 | enabled, 13 | }: CreateLoggerProps): Logger => { 14 | const logger = pino({ 15 | name, 16 | enabled, 17 | }); 18 | 19 | return logger; 20 | }; 21 | -------------------------------------------------------------------------------- /src/scraper/createRunner.ts: -------------------------------------------------------------------------------- 1 | import range from 'lodash/range'; 2 | import { pipe } from 'fp-ts/function'; 3 | import { fromNullable, match } from 'fp-ts/Option'; 4 | import PromisePool from '@supercharge/promise-pool/dist'; 5 | 6 | import { fetchPage } from '../http/fetchPage'; 7 | import { flat } from '../utilities/flat'; 8 | import { createWindow } from '../utilities/createWindow'; 9 | import { SelectorUtilities } from '../selectors/createSelectorUtilities'; 10 | 11 | import { scrape } from './scrape'; 12 | import { Logger } from './createLogger'; 13 | import { ScraperOptions } from './createScraper'; 14 | 15 | /** 16 | * Map of selector functions. 17 | * 18 | * This type is meant to be checked with an extended type, 19 | * as users are going to implement a derived version of this 20 | * for custom scrapers. 21 | */ 22 | export type SelectorMap = Record; 23 | 24 | /** 25 | * Function to be used when scraping the target node 26 | * for specific data. 27 | */ 28 | export type SelectorFunction = ( 29 | /** 30 | * Scraping utilities offered by papercut. 31 | */ 32 | utils: SelectorUtilities, 33 | 34 | /** 35 | * A reference to the selector map in which 36 | * this selector function is being implemented. 
37 | * 38 | * Handy when you want to reuse logic 39 | * from another selector. 40 | */ 41 | self: SelectorMap 42 | ) => any; 43 | 44 | export interface PaginationOptions { 45 | /** 46 | * Enables pagination. 47 | * @default false 48 | */ 49 | enabled: boolean; 50 | 51 | /** 52 | * Function with custom logic to build 53 | * the paginated url for a specific page number. 54 | */ 55 | createPaginatedUrl: (baseUrl: string, pageNumber: number) => string; 56 | 57 | /** 58 | * DOM selector to fetch the last page number 59 | * from the page being scraped. 60 | */ 61 | lastPageNumberSelector: string; 62 | } 63 | 64 | export interface CreateRunnerProps { 65 | /** 66 | * A pino.Logger instance. 67 | */ 68 | logger: Logger; 69 | /** 70 | * The scraper options. 71 | * Use this to tweak log, cache and concurrency settings. 72 | */ 73 | options: ScraperOptions; 74 | } 75 | 76 | export interface RunProps { 77 | /** 78 | * If enabled, this will make Papercut scrape the page in strict mode. 79 | * This means that in case a selector function fails, the entire scraping will 80 | * be halted with an error. 81 | * 82 | * When enabled, the result types will **not** expect undefined values. 83 | */ 84 | strict: B; 85 | 86 | /** 87 | * The base url to start scraping off. 88 | * 89 | * This page will be fetched, parsed and mounted in a virtual JSDOM instance. 90 | */ 91 | baseUrl: string; 92 | 93 | /** 94 | * The DOM selector for the target nodes to be scraped. 95 | */ 96 | target: string; 97 | 98 | /** 99 | * The selectors to be used during the scraping process. 100 | * 101 | * The result object will match the schema of the selectors. 102 | */ 103 | selectors: T; 104 | 105 | /** 106 | * Optional pagination feature. 107 | * 108 | * If enabled and configured, this will make papercut 109 | * fetch, parse, mount and scrape multiple pages based 110 | * on a URL creation pattern. 
111 | * 112 | * As long as you have a way to fetch the last page number 113 | * from the page you're scraping, and use it as a query param 114 | * in the page url, you should be fine. 115 | */ 116 | pagination?: PaginationOptions; 117 | } 118 | 119 | /** 120 | * Creates a runner instance. 121 | * 122 | * This method is called by the createScraper function, 123 | * but can also be externally used if needed to use an 124 | * external pino logger or prefer full control over 125 | * the scraper options. 126 | * 127 | * @param props The runner logger and options. 128 | */ 129 | export const createRunner = ({ 130 | logger, 131 | options, 132 | }: CreateRunnerProps) => { 133 | /** 134 | * The scraper runner. 135 | * 136 | * When executed, it will fetch the base url and 137 | * build a JSDOM using the received HTML payload 138 | * in order to make a virtual window and document 139 | * available for scraping. 140 | * 141 | * Once these are ready, the scraper will start to 142 | * spawn promise pools to deal with more intensive 143 | * tasks, such as pagination, node scraping and 144 | * selector scraping in parallel. 145 | * 146 | * All these settings will depend on the options given 147 | * during the creation of the scraper struct. 148 | * 149 | * @typeParam T A mapped type based on the given selectors. 150 | * @typeParam B The strict mode boolean type. Used to tweak the scrape result type strictness. 151 | * @param props The scraping runner properties and selectors. 152 | * @returns result Type-safe scraping results based on the given selectors and strict mode. 
153 | */ 154 | const run = async ({ 155 | strict, 156 | baseUrl, 157 | target, 158 | selectors, 159 | pagination, 160 | }: RunProps) => { 161 | logger.info('Fetching main page...'); 162 | 163 | const mainPageHTML = await fetchPage(baseUrl); 164 | 165 | logger.info('Parsing main page...'); 166 | 167 | const mainPageWindow = createWindow(mainPageHTML); 168 | 169 | logger.info('Starting scraping process...'); 170 | 171 | const results = await pipe( 172 | fromNullable( 173 | getLastPageNumberFromDocument( 174 | mainPageWindow.document, 175 | pagination 176 | ) 177 | ), 178 | match( 179 | async () => { 180 | logger.info( 181 | 'Unable to find last page number. Scraping main page only.' 182 | ); 183 | 184 | const mainPageResults = await scrape({ 185 | strict, 186 | target, 187 | document: mainPageWindow.document, 188 | selectors, 189 | logger, 190 | options, 191 | }); 192 | 193 | mainPageWindow.close(); 194 | 195 | return mainPageResults; 196 | }, 197 | async (lastPageNumber) => { 198 | logger.info(`Found ${lastPageNumber} pages`); 199 | 200 | const createPaginatedUrl = pagination?.createPaginatedUrl; 201 | 202 | if (!createPaginatedUrl) { 203 | throw new Error( 204 | 'Please define a function to help papercut create the paginated url in the pagination options.' 205 | ); 206 | } 207 | 208 | const pageNumbers = range(1, lastPageNumber); 209 | 210 | const { results, errors } = 211 | await PromisePool.withConcurrency( 212 | options.concurrency.page 213 | ) 214 | .for(pageNumbers) 215 | .process(async (pageNumber: number) => { 216 | logger.info(`Fetching page no. ${pageNumber}`); 217 | 218 | const pagePayload = await fetchPage( 219 | createPaginatedUrl(baseUrl, pageNumber) 220 | ); 221 | 222 | logger.info(`Parsing page no. ${pageNumber}`); 223 | 224 | const pageWindow = createWindow(pagePayload); 225 | 226 | logger.info(`Scraping page no. 
${pageNumber}`); 227 | 228 | const pageResult = await scrape({ 229 | strict, 230 | target, 231 | document: pageWindow.document, 232 | selectors, 233 | logger, 234 | options, 235 | }); 236 | 237 | pageWindow.close(); 238 | 239 | return pageResult; 240 | }); 241 | 242 | if (errors) { 243 | logger.error( 244 | 'Some scraping requests failed with errors.' 245 | ); 246 | logger.error(errors); 247 | } 248 | 249 | mainPageWindow.close(); 250 | 251 | return flat(results); 252 | } 253 | ) 254 | ); 255 | 256 | return results; 257 | }; 258 | 259 | return run; 260 | }; 261 | 262 | const getLastPageNumberFromDocument = ( 263 | document: Document, 264 | options?: PaginationOptions 265 | ): number | undefined => { 266 | if (!options?.enabled) { 267 | return; 268 | } 269 | 270 | const lastPageNumberElement = document.querySelector( 271 | options.lastPageNumberSelector 272 | ); 273 | 274 | if (!lastPageNumberElement) { 275 | throw new Error( 276 | `Failed to find last page number using the given selector: "${options.lastPageNumberSelector}"` 277 | ); 278 | } 279 | 280 | const content = lastPageNumberElement.textContent; 281 | const possibleLastPageNumber = Number(content); 282 | 283 | if (isNaN(possibleLastPageNumber)) { 284 | throw new Error( 285 | `Failed to parse last page number using content found on given selector. 
Found "${content}" for selector "${options.lastPageNumberSelector}"` 286 | ); 287 | } 288 | 289 | return possibleLastPageNumber; 290 | }; 291 | -------------------------------------------------------------------------------- /src/scraper/createScraper.test.ts: -------------------------------------------------------------------------------- 1 | import { orderBy } from 'lodash'; 2 | import path from 'path'; 3 | 4 | process.env.PAPERCUT_PAGE_CACHE_PATH = path.resolve( 5 | __dirname, 6 | './__fixtures__/testpagecache' 7 | ); 8 | 9 | process.env.PAPERCUT_GEOSEARCH_CACHE_PATH = path.resolve( 10 | __dirname, 11 | './__fixtures__/testgeocache' 12 | ); 13 | 14 | test('createScraper - Single page (strict mode off)', async () => { 15 | const { createScraper } = await import('./createScraper'); 16 | const scraper = createScraper({ 17 | name: `Hacker News`, 18 | options: { 19 | log: false, 20 | cache: true, 21 | }, 22 | }); 23 | 24 | const results = await scraper.run({ 25 | strict: false, 26 | baseUrl: 'https://news.ycombinator.com/', 27 | target: '.athing', 28 | selectors: { 29 | rank: (utils) => { 30 | const value = utils.text('.rank').replace(/^\D+/g, ''); 31 | return Number(value); 32 | }, 33 | name: ({ text }) => text('.titlelink'), 34 | url: ({ href }) => href('.titlelink'), 35 | score: ({ element }) => { 36 | return element.nextElementSibling?.querySelector('.score') 37 | ?.textContent; 38 | }, 39 | createdBy: ({ element }) => { 40 | return element.nextElementSibling?.querySelector('.hnuser') 41 | ?.textContent; 42 | }, 43 | createdAt: ({ element }) => { 44 | return element.nextElementSibling 45 | ?.querySelector('.age') 46 | ?.getAttribute('title'); 47 | }, 48 | failingOnPurpose: () => { 49 | throw new Error('nope'); 50 | }, 51 | }, 52 | }); 53 | 54 | expect(results).toMatchSnapshot(); 55 | }); 56 | 57 | test('createScraper - Single page (strict mode on)', async () => { 58 | const { createScraper } = await import('./createScraper'); 59 | const scraper = createScraper({ 
60 | name: `Hacker News`, 61 | options: { 62 | log: false, 63 | cache: true, 64 | }, 65 | }); 66 | 67 | const results = await scraper.run({ 68 | strict: true, 69 | baseUrl: 'https://news.ycombinator.com/', 70 | target: '.athing', 71 | selectors: { 72 | rank: (utils) => { 73 | const value = utils.text('.rank').replace(/^\D+/g, ''); 74 | return Number(value); 75 | }, 76 | name: ({ text }) => text('.titlelink'), 77 | url: ({ href }) => href('.titlelink'), 78 | score: ({ element }) => { 79 | return element.nextElementSibling?.querySelector('.score') 80 | ?.textContent; 81 | }, 82 | createdBy: ({ element }) => { 83 | return element.nextElementSibling?.querySelector('.hnuser') 84 | ?.textContent; 85 | }, 86 | createdAt: ({ element }) => { 87 | return element.nextElementSibling 88 | ?.querySelector('.age') 89 | ?.getAttribute('title'); 90 | }, 91 | }, 92 | }); 93 | 94 | expect(results).toMatchSnapshot(); 95 | }); 96 | 97 | test('createScraper - Pagination', async () => { 98 | jest.setTimeout(120_000); 99 | const { createScraper } = await import('./createScraper'); 100 | 101 | const createLabeledUrl = (label: string, url: string) => ({ 102 | label, 103 | url, 104 | }); 105 | 106 | const scraper = createScraper({ 107 | name: 'Amsterdam Coffeeshops', 108 | options: { 109 | cache: true, 110 | }, 111 | }); 112 | 113 | const results = await scraper.run({ 114 | strict: true, 115 | target: '.summary-box', 116 | baseUrl: 117 | 'https://amsterdamcoffeeshops.com/search/item/coffeeshops', 118 | pagination: { 119 | enabled: true, 120 | lastPageNumberSelector: 121 | '.navigation > .pagination > li:nth-child(8) > a', 122 | createPaginatedUrl: (baseUrl, pageNumber) => { 123 | return `${baseUrl}/p:${pageNumber}`; 124 | }, 125 | }, 126 | selectors: { 127 | name: ({ text }) => { 128 | return text('.media-body > h3 > a'); 129 | }, 130 | description: ({ text }) => { 131 | return text('.media-body > .summary-desc'); 132 | }, 133 | photo: ({ src }) => { 134 | return { url: src('.media-left > a 
> img') }; 135 | }, 136 | phone: ({ text }) => { 137 | return text('.media-right > .contact-info > mark > a'); 138 | }, 139 | address: ({ text }) => { 140 | const address = text('.media-body > address > p'); 141 | 142 | if (!address) { 143 | return undefined; 144 | } 145 | 146 | return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, ''); 147 | }, 148 | location: async (selectors, $this) => { 149 | const address = $this.address(selectors, $this); 150 | return selectors.geosearch(address); 151 | }, 152 | social: ({ href }) => { 153 | const websiteHref = href('.visit-website'); 154 | return websiteHref 155 | ? [createLabeledUrl('Official Website', websiteHref)] 156 | : []; 157 | }, 158 | menus: () => { 159 | /** TODO: scrape menus */ 160 | return []; 161 | }, 162 | badges: ({ all }) => { 163 | const { asArray: badges } = all( 164 | '.media-left > div > div > img' 165 | ); 166 | 167 | return badges 168 | .map((badge) => badge.getAttribute('title')) 169 | .filter((badge) => badge !== undefined) as string[]; 170 | }, 171 | rating: ({ className }) => { 172 | const rateNumber = className( 173 | '.media-right > .summary-info > span > span' 174 | ); 175 | 176 | if (!rateNumber) { 177 | return 0; 178 | } 179 | 180 | return Number(rateNumber.replace('rate-', '')); 181 | }, 182 | }, 183 | }); 184 | 185 | const sortedResults = orderBy( 186 | results, 187 | [(coffeeshop) => coffeeshop.name.toLowerCase()], 188 | ['asc'] 189 | ); 190 | 191 | expect(sortedResults).toMatchSnapshot(); 192 | }); 193 | -------------------------------------------------------------------------------- /src/scraper/createScraper.ts: -------------------------------------------------------------------------------- 1 | import { createLogger } from './createLogger'; 2 | import { createRunner } from './createRunner'; 3 | 4 | export interface ScraperProps { 5 | /** 6 | * The scraper name. 7 | * This will be used only for logging purposes. 8 | */ 9 | name: string; 10 | /** 11 | * The scraper options. 
12 | * Use this to tweak log, cache and concurrency settings. 13 | */ 14 | options?: Partial; 15 | } 16 | 17 | export interface ScraperOptions { 18 | /** 19 | * Enables writing pino logs to the stdout. 20 | * @default process.env.DEBUG === "true" 21 | */ 22 | log: boolean; 23 | /** 24 | * Enables HTML payload caching on the disk. 25 | * Keep in mind that papercut **will not** clear the cache for you. 26 | * When enabling this, it's your responsibility to deal with cache invalidation. 27 | * 28 | * @default false 29 | */ 30 | cache: boolean; 31 | /** 32 | * Concurrency settings. 33 | */ 34 | concurrency: { 35 | /** 36 | * Amount of concurrent promises for page scraping. 37 | * @default 2 38 | */ 39 | page: number; 40 | /** 41 | * Amount of concurrent promises for node scraping. 42 | * @default 2 43 | */ 44 | node: number; 45 | /** 46 | * Amount of concurrent promises for selector scraping. 47 | * @default 2 48 | */ 49 | selector: number; 50 | }; 51 | } 52 | 53 | export const defaultOptions: ScraperOptions = { 54 | log: process.env.DEBUG === 'true', 55 | cache: false, 56 | concurrency: { 57 | page: 2, 58 | node: 2, 59 | selector: 2, 60 | }, 61 | }; 62 | 63 | export type Scraper = ReturnType; 64 | 65 | /** 66 | * Creates a new scraper runner. 67 | * 68 | * This method is papercut's entrypoint. It will create 69 | * a Scraper struct containing a runner that you can tweak 70 | * as needed. 71 | * 72 | * The runner is going to abide by the settings given 73 | * during the creation of this object. 74 | * 75 | * This function will also create a pino logger 76 | * and embed it within the runner. 77 | * 78 | * In case you prefer to manage the logger yourself, 79 | * please use `createRunner` instead.
80 | */ 81 | export const createScraper = (props: ScraperProps) => { 82 | const options: ScraperOptions = { 83 | ...defaultOptions, 84 | ...props.options, 85 | concurrency: { 86 | ...defaultOptions.concurrency, 87 | ...props.options?.concurrency, 88 | }, 89 | }; 90 | 91 | const logger = createLogger({ 92 | name: props.name, 93 | enabled: options.log, 94 | }); 95 | 96 | /** 97 | * The scraper struct. 98 | */ 99 | const scraper = { 100 | run: createRunner({ 101 | logger, 102 | options, 103 | }), 104 | }; 105 | 106 | return scraper; 107 | }; 108 | -------------------------------------------------------------------------------- /src/scraper/scrape.test-d.ts: -------------------------------------------------------------------------------- 1 | import { expectType } from 'tsd'; 2 | import { scrape } from './scrape'; 3 | import { Logger } from './createLogger'; 4 | import { defaultOptions } from './createScraper'; 5 | 6 | const strictResult = scrape({ 7 | strict: true, 8 | document: {} as Document, 9 | target: '.demo', 10 | logger: {} as Logger, 11 | options: defaultOptions, 12 | selectors: { 13 | foo: ({ text }) => text('.foo'), 14 | bar: () => 'bar' as const, 15 | optional: ({ element }) => 16 | element.nextElementSibling?.textContent, 17 | asyncValue: async ({ text }) => { 18 | await Promise.resolve(); 19 | return text('.async'); 20 | }, 21 | }, 22 | }); 23 | 24 | const looseResult = scrape({ 25 | strict: false, 26 | document: {} as Document, 27 | target: '.demo', 28 | logger: {} as Logger, 29 | options: defaultOptions, 30 | selectors: { 31 | foo: ({ text }) => text('.foo'), 32 | bar: () => 'bar' as const, 33 | optional: ({ element }) => 34 | element.nextElementSibling?.textContent, 35 | asyncValue: async ({ text }) => { 36 | await Promise.resolve(); 37 | return text('.async'); 38 | }, 39 | }, 40 | }); 41 | 42 | type ResultType = { 43 | foo: string; 44 | bar: 'bar'; 45 | optional: string | null | undefined; 46 | asyncValue: string; 47 | }; 48 | 49 | type 
ExpectedStrictResultType = Promise; 50 | type ExpectedLooseResultType = Promise[]>; 51 | 52 | expectType(strictResult); 53 | expectType(looseResult); 54 | -------------------------------------------------------------------------------- /src/scraper/scrape.ts: -------------------------------------------------------------------------------- 1 | import PromisePool from '@supercharge/promise-pool'; 2 | import { Logger } from './createLogger'; 3 | import { SelectorMap } from './createRunner'; 4 | import { ScraperOptions } from './createScraper'; 5 | 6 | import { supress } from '../utilities/supress'; 7 | import { 8 | createSelectorUtilities, 9 | SelectorUtilities, 10 | } from '../selectors/createSelectorUtilities'; 11 | import { mapNodeListToArray } from '../utilities/mapNodeListToArray'; 12 | 13 | export interface ScrapeProps< 14 | T extends SelectorMap, 15 | B extends boolean 16 | > { 17 | strict: B; 18 | target: string; 19 | document: Document; 20 | selectors: T; 21 | logger: Logger; 22 | options: ScraperOptions; 23 | } 24 | 25 | export type Awaited = T extends PromiseLike ? U : T; 26 | 27 | export type ScrapeResultType< 28 | T extends SelectorMap, 29 | B extends boolean 30 | > = B extends true 31 | ? { [Prop in keyof T]: Awaited> } 32 | : { [Prop in keyof T]?: Awaited> }; 33 | 34 | /** 35 | * the scrape function 36 | * 37 | * this function will select all target nodes from 38 | * the given document and spawn promise pools for 39 | * triggering selector scraping. 40 | * 41 | * this function is used by papercut runner with 42 | * the managed jsdom instances. 43 | * 44 | * if you want to have more control over jsdom 45 | * but still leverage papercut, you can use this 46 | * function directly instead of using `createScraper` 47 | * or `createRunner` 48 | * 49 | * @typeParam T A mapped type based on the given selectors. 50 | * @typeParam B The strict mode boolean type. Used to tweak the scrape result type strictness. 
51 | * @param props The scraping properties and selectors. 52 | */ 53 | export async function scrape< 54 | T extends SelectorMap, 55 | B extends boolean 56 | >({ 57 | strict, 58 | target, 59 | document, 60 | selectors, 61 | logger, 62 | options, 63 | }: ScrapeProps) { 64 | const nodes = mapNodeListToArray(document.querySelectorAll(target)); 65 | 66 | type SelectorKey = keyof T; 67 | const selectorKeys = Object.keys(selectors) as SelectorKey[]; 68 | 69 | /** 70 | * Higher order function that will create 71 | * a selector scraper based on the given 72 | * node selectors and selectors. 73 | */ 74 | const createSelectorScraper = 75 | (selectorUtils: SelectorUtilities) => 76 | async (selectorKey: SelectorKey) => { 77 | const selectorFn = selectors[selectorKey]; 78 | 79 | const selectorScrapedValue = strict 80 | ? await selectorFn(selectorUtils, selectors) 81 | : await supress( 82 | () => selectorFn(selectorUtils, selectors), 83 | (error) => logger.error(error) 84 | ); 85 | 86 | return { 87 | [selectorKey]: selectorScrapedValue, 88 | } as ScrapeResultType; 89 | }; 90 | 91 | /** 92 | * The node scraper 93 | * 94 | * This function will create the node selectors, 95 | * a selector scraper, and run the selector scraper 96 | * for each node using a Promise Pool. 97 | * 98 | * Concurrency can be controlled via papercut options. 99 | */ 100 | const scrapeNode = async (node: Element) => { 101 | const nodeSelectorUtilities = createSelectorUtilities(node); 102 | 103 | const { results: scrapeResults } = 104 | await PromisePool.withConcurrency(options.concurrency.selector) 105 | .for(selectorKeys) 106 | .process(createSelectorScraper(nodeSelectorUtilities)); 107 | 108 | const nodeScrapeResult = scrapeResults.reduce( 109 | (accumulator, scrapeResult) => ({ 110 | ...accumulator, 111 | ...scrapeResult, 112 | }), 113 | {} as ScrapeResultType 114 | ); 115 | 116 | return nodeScrapeResult; 117 | }; 118 | 119 | /** 120 | * Trigger node scrapers. 
121 | * Concurrency can be configured using Papercut options. 122 | */ 123 | const { results } = await PromisePool.withConcurrency( 124 | options.concurrency.node 125 | ) 126 | .for(nodes) 127 | .process(scrapeNode); 128 | 129 | return results; 130 | } 131 | -------------------------------------------------------------------------------- /src/selectors/createSelectorUtilities.ts: -------------------------------------------------------------------------------- 1 | import { geosearch } from '../http/geosearch'; 2 | import { fetchPage } from '../http/fetchPage'; 3 | import { createWindow } from '../utilities/createWindow'; 4 | import { mapNodeListToArray } from '../utilities/mapNodeListToArray'; 5 | 6 | export type SelectorUtilities = ReturnType< 7 | typeof createSelectorUtilities 8 | >; 9 | 10 | /** 11 | * This method creates the selector utilities provided 12 | * to every selector function given to the scrape method. 13 | * 14 | * These utilities are meant to make the experience of 15 | * using papercut a bit more pleasant. They're currently 16 | * not extendable, but one could, in theory, extend them 17 | * with higher-order functions. 18 | * 19 | * Almost every single one of these methods has a default 20 | * fallback of an empty string, in case it fails to find the 21 | * element or a specific property. 22 | * 23 | * At the same time, you also have direct access to the element from selector functions if needed for more complex tasks. 24 | */ 25 | export const createSelectorUtilities = (element: Element) => { 26 | const $ = element.querySelector.bind(element); 27 | const attr = (attribute: string) => (selector: string) => { 28 | const fallback = ''; 29 | const innerElement = $(selector); 30 | 31 | if (!innerElement) { 32 | return fallback; 33 | } 34 | 35 | const attr = 36 | attribute === 'textContent' 37 | ? innerElement[attribute] 38 | : innerElement.getAttribute(attribute); 39 | 40 | return attr ??
fallback; 41 | }; 42 | 43 | const all = (selector: string) => { 44 | const nodes = element.querySelectorAll(selector); 45 | 46 | return { 47 | nodes, 48 | asArray: mapNodeListToArray(nodes), 49 | }; 50 | }; 51 | 52 | return { 53 | text: attr('textContent'), 54 | src: attr('src'), 55 | href: attr('href'), 56 | className: attr('class'), 57 | attr: (selector: string, attribute: string) => 58 | attr(attribute)(selector), 59 | all, 60 | element, 61 | geosearch, 62 | fetchPage, 63 | createWindow, 64 | mapNodeListToArray, 65 | }; 66 | }; 67 | -------------------------------------------------------------------------------- /src/utilities/createWindow.ts: -------------------------------------------------------------------------------- 1 | import { DOMWindow, JSDOM } from 'jsdom'; 2 | 3 | export const createWindow = (htmlContent: string) => { 4 | let window: DOMWindow | null = new JSDOM(htmlContent).window; 5 | let document: Document | null = window.document; 6 | 7 | return { 8 | window, 9 | document, 10 | close: () => { 11 | window?.close(); 12 | window = null; 13 | document = null; 14 | }, 15 | }; 16 | }; 17 | -------------------------------------------------------------------------------- /src/utilities/flat.ts: -------------------------------------------------------------------------------- 1 | export const flat = (array: T[][]): T[] => 2 | ([] as T[]).concat(...array); 3 | -------------------------------------------------------------------------------- /src/utilities/hash.ts: -------------------------------------------------------------------------------- 1 | import { createHash } from 'crypto'; 2 | 3 | export const hash = (str: string) => { 4 | return createHash('sha256').update(str).digest('hex'); 5 | }; 6 | -------------------------------------------------------------------------------- /src/utilities/mapNodeListToArray.ts: -------------------------------------------------------------------------------- 1 | export const mapNodeListToArray = (nodeList: NodeList): 
Element[] => { 2 | return Array.prototype.slice.call(nodeList); 3 | }; 4 | -------------------------------------------------------------------------------- /src/utilities/supress.ts: -------------------------------------------------------------------------------- 1 | export const supress = async ( 2 | fn: () => T | Promise, 3 | onError?: (err: any) => void 4 | ) => { 5 | try { 6 | return await fn(); 7 | } catch (err) { 8 | if (onError) { 9 | onError(err); 10 | } 11 | return undefined; 12 | } 13 | }; 14 | -------------------------------------------------------------------------------- /template.md: -------------------------------------------------------------------------------- 1 | # Papercut 2 | 3 | [![NPM](https://img.shields.io/npm/v/@armand1m/papercut.svg)](https://www.npmjs.com/package/@armand1m/papercut) 4 | [![codecov](https://codecov.io/gh/armand1m/papercut/branch/master/graph/badge.svg)](https://codecov.io/gh/armand1m/papercut) 5 | [![bundlephobia](https://badgen.net/bundlephobia/min/@armand1m/papercut)](https://bundlephobia.com/result?p=@armand1m/papercut) 6 | [![bundlephobia](https://badgen.net/bundlephobia/minzip/@armand1m/papercut)](https://bundlephobia.com/result?p=@armand1m/papercut) 7 | 8 | > Papercut is a scraping/crawling library for Node.js, written in Typescript. 9 | 10 | Papercut provides a small type-safe and tested foundation that makes it fairly easy to scrape webpages with confidence. 11 | 12 | ## Features 13 | 14 | ### Selectors API 15 | 16 | Inspired by GraphQL Resolvers, Papercut works similarly by allowing you to specify selectors for each scraper runner. 17 | The type definition for the scrape result array items is guaranteed to be compliant with the selectors given. 18 | 19 | ### JSDOM Integration 20 | 21 | Instead of relying on a headless browser engine, papercut relies on JSDOM to process client-side javascript code. This means that Papercut is also able to scrape Single Page Applications _(to a certain extent)_. 
22 | 23 | ### Concurrency controls 24 | 25 | Papercut makes use of Promise Pools to run pagination, node scraping and selector scraping. It comes with sane defaults for simple tasks, plus configurable properties to make sure you have the flexibility to suit your needs. 26 | 27 | ### Pagination 28 | 29 | In most cases when web scraping, you're looking to scrape a feed. This feed can be quite long and you might have other challenges like pagination and a hard-to-predict total number of pages. 30 | 31 | Luckily, most of the time, there is some way to figure out the last page number in the UI. Papercut allows you to set a selector to find an element that contains the last page number and a callback for creating the url for each page number using the base url. 32 | 33 | As page urls are not always implemented in the same way, Papercut leaves it up to you to tell it how to build them. 34 | 35 | ### Page Caching 36 | 37 | As many websites introduce rate limits or blocks for scrapers, page caching is a useful feature for scraping. 38 | 39 | Once Papercut hits a page, it stores the payload locally in order to reuse it for subsequent executions. This reduces the need for network requests. 40 | 41 | **Note:** when scraping a large number of pages, be mindful of disk space. Papercut **does not** handle cache invalidation. 42 | 43 | ### Cached Geosearch 44 | 45 | Sometimes when scraping pages for a list of locations, you might want to convert those into latitude and longitude points. Papercut comes with a geosearch handler with caching that enables you to convert scraped addresses into lat/lng objects. 46 | 47 | To avoid overloading the services that papercut uses for that _(like Nominatim from OpenStreetMap)_, we cache the results to save on subsequent requests and add concurrency limits to comply with rate limits. 48 | 49 | ### Easy for simple tasks, flexible for difficult ones 50 | 51 | Papercut offers a nice selector foundation for basic needs of a scraping tool.
Text, attributes, url, image srcs, and many other handy selectors. 52 | 53 | When you find yourself in a situation where a simple selector isn't enough, you'll still be able to access the element, the window, or even create a new window instance if needed. 54 | 55 | As tasks can grow in complexity, Papercut focuses on being a guardrail but not a gatekeeper. 56 | 57 | ## Usage/Examples 58 | 59 | You can find more examples in the `./examples` folder. 60 | 61 | ### Quick example 62 | 63 | Create an empty project with yarn: 64 | 65 | ```sh 66 | mkdir papercut-demo 67 | cd papercut-demo 68 | yarn init -y 69 | ``` 70 | 71 | Add papercut and the needed peer dependencies: 72 | 73 | ```sh 74 | yarn add @armand1m/papercut jsdom pino 75 | ``` 76 | 77 | #### Single page scraper 78 | 79 | For this example, we're going to scrape the first page of Hacker News. 80 | 81 | Set up a scraper instance and set the selectors using the utilities offered: 82 | 83 | ```ts file=./examples/typescript/src/hacker-news/scraper.ts 84 | ``` 85 | 86 | Then run it using `node` or `ts-node`: 87 | 88 | ```sh 89 | npx ts-node ./single-page-scraper.ts 90 | ``` 91 | 92 | #### Paginated scraper 93 | 94 | For this example, because I live in Amsterdam, we're going to scrape the Amsterdam Coffeeshops website for all coffeeshops in Amsterdam. 95 | 96 | Set up a scraper instance and set the selectors using the utilities offered: 97 | 98 | ```ts file=./examples/typescript/src/amsterdam-coffeeshops/scraper.ts 99 | ``` 100 | 101 | Then run it using `node` or `ts-node`: 102 | 103 | ```sh 104 | npx ts-node ./paginated-scraper.ts 105 | ``` 106 | 107 | #### Managed JSDOM 108 | 109 | In case you want to use your own JSDOM and Pino instances and tweak/configure as much as you prefer, you can use the `scrape` function instead. 110 | 111 | In the example below, we use the exposed `createWindow` and `fetchPage` utilities for convenience. You can use the JSDOM constructor directly and any other strategy to fetch your page HTML as desired.
112 | 113 | ```ts file=./examples/typescript/src/managed-jsdom/scraper.ts 114 | ``` 115 | 116 | Then run it using `node` or `ts-node`: 117 | 118 | ```sh 119 | npx ts-node ./managed-jsdom.ts 120 | ``` 121 | 122 | ## API Reference 123 | 124 | [Click here to open the API reference.](https://armand1m.github.io/papercut) 125 | 126 | ## Environment Variables 127 | 128 | Papercut works well out of the box, but some environment variables are available for customizing behavior: 129 | 130 | `DEBUG=true`: enables logging to stdout by default (it sets the default value of the `log` scraper option). 131 | 132 | ## Roadmap 133 | 134 | - [x] Add unit tests 135 | - [x] Add documentation generation 136 | - [x] Create a gh-pages site for the library 137 | - [x] Create more examples 138 | - [ ] Create a Medium article introducing the library 139 | 140 | ## Contributing 141 | 142 | Contributions are always welcome! 143 | 144 | See `CONTRIBUTING.md` for ways to get started. 145 | 146 | ## FAQ 147 | 148 | #### Why not use `puppeteer`, `selenium` or `webdriver`? 149 | 150 | JSDOM is lighter and easier to use than a headless browser engine and _(I hope that it)_ allows for enough scraping capabilities. Setup is minimal and it works out of the box with minimal overhead to users of this library. Please open an issue if you'd like to discuss this further; I could definitely be wrong. 151 | 152 | #### Why not use `cheerio`? 153 | 154 | I like the idea. I see papercut being flexible in the future to use different engines, so you'd be able to switch from JSDOM to cheerio, though I'm not sure if I see much value in it. Please open an issue if you'd like to discuss a possible API implementation here.
155 | 156 | ## Contributors 157 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["src", "types"], 3 | "compilerOptions": { 4 | "module": "esnext", 5 | "lib": ["dom", "esnext"], 6 | "pretty": true, 7 | "removeComments": false, 8 | "noImplicitAny": true, 9 | "noImplicitThis": true, 10 | "alwaysStrict": true, 11 | "strictFunctionTypes": true, 12 | "strictPropertyInitialization": true, 13 | "forceConsistentCasingInFileNames": true, 14 | "diagnostics": true, 15 | "listEmittedFiles": true, 16 | "strictNullChecks": true, 17 | "experimentalDecorators": true, 18 | "isolatedModules": true, 19 | "skipLibCheck": true, 20 | "resolveJsonModule": true, 21 | "importHelpers": false, 22 | "declaration": true, 23 | "sourceMap": true, 24 | "rootDir": "./src", 25 | "strict": true, 26 | "noUnusedLocals": true, 27 | "noUnusedParameters": true, 28 | "noImplicitReturns": true, 29 | "noFallthroughCasesInSwitch": true, 30 | "moduleResolution": "node", 31 | "baseUrl": "./", 32 | "paths": { 33 | "*": ["src/*", "node_modules/*"] 34 | }, 35 | "jsx": "react", 36 | "esModuleInterop": true, 37 | "downlevelIteration": false 38 | } 39 | } 40 | --------------------------------------------------------------------------------