├── .circleci └── config.yml ├── .eslintrc.json ├── .github └── workflows │ ├── github_pages.yml │ └── netlify.yml ├── .gitignore ├── .nvmrc ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── action.yml ├── build ├── index.js ├── index.js.map └── sourcemap-register.js ├── cover.jpg ├── examples ├── basic.yml ├── github_pages.yml ├── netlify.yml ├── vercel_pr.yml └── vercel_push.yml ├── jest.config.js ├── package.json ├── public ├── github-pages │ ├── 1.html │ ├── 2.html │ └── index.html ├── netlify │ ├── 1.html │ ├── 2.html │ └── index.html └── vercel │ ├── 1.html │ ├── 2.html │ └── index.html ├── release.config.js ├── renovate.json ├── src ├── crawler-api-client.ts ├── helpers.test.ts ├── helpers.ts ├── index.ts └── types │ ├── algoliaSettings.ts │ ├── config.ts │ ├── configJson.ts │ ├── fileTypes.ts │ ├── github.ts │ ├── publicApiJsonResponses.ts │ └── utils.ts ├── tsconfig.json └── yarn.lock /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | aliases: 3 | # Forward the current folder when using wokflows 4 | # persist-to-workspace & attach-workspace 5 | - &persist-work-dir 6 | root: . 7 | paths: . 8 | 9 | - &attach-work-dir 10 | at: ~/project/ 11 | 12 | # Dependencies 13 | - &yarn 14 | name: Run Yarn 15 | command: | 16 | yarn install --non-interactive --cache-folder ~/.cache/yarn 17 | defaults: &defaults 18 | working_directory: ~/project 19 | 20 | version: 2 21 | 22 | jobs: 23 | checkout: 24 | docker: 25 | - image: cimg/node:16.20.2 26 | steps: 27 | - checkout 28 | - run: *yarn 29 | - persist-to-workspace: *persist-work-dir 30 | 31 | lint: 32 | <<: *defaults 33 | docker: 34 | - image: cimg/node:16.20.2 35 | 36 | steps: 37 | - attach-workspace: *attach-work-dir 38 | - run: yarn lint 39 | 40 | test: 41 | <<: *defaults 42 | docker: 43 | - image: cimg/node:16.20.2 44 | 45 | steps: 46 | - attach-workspace: *attach-work-dir 47 | - run: yarn test 48 | 49 | release: 50 | <<: *defaults 51 | docker: 52 | - image: cimg/node:16.20.2 53 | steps: 54 | - attach-workspace: *attach-work-dir 55 | - run: yarn build && yarn compile && yarn semantic-release 56 | 57 | workflows: 58 | version: 2 59 | suite: 60 | jobs: 61 | - checkout 62 | - test: 63 | requires: 64 | - checkout 65 | - lint: 66 | requires: 67 | - checkout 68 | - release: 69 | requires: 70 | - test 71 | - lint 72 | filters: 73 | branches: 74 | only: main 75 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "algolia", 4 | "algolia/typescript", 5 | "algolia/jest" 6 | ], 7 | "env": { 8 | "es6": true, 9 | "jest": true 10 | }, 11 | "plugins": [ 12 | "jest" 13 | ], 14 | "rules": { 15 | "import/no-commonjs": "off", 16 | "spaced-comment": "off", 17 | // TMP 18 | "jsdoc/check-examples": [ 19 | "off" 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/github_pages.yml: -------------------------------------------------------------------------------- 1 | name: Github Pages -> Algolia Crawler 2 | on: 3 | push: 4 | branches: [ main ] 5 | 6 | jobs: 7 | algolia_recrawl: 8 | name: Algolia Recrawl 9 | runs-on: ubuntu-latest 10 | steps: 11 | # checkout this repo 12 | - name: Checkout Repo 13 | uses: actions/checkout@v2 14 | - name: Sleep for 30s 15 | run: sleep 30 16 | - name: Github-pages-MAIN => Algolia crawler creation and recrawl (Push on Main branch) 17 
| uses: ./ 18 | id: crawler_push 19 | with: 20 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 21 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 22 | crawler-api-base-url: 'https://crawler-dev.algolia.com/api/1' 23 | crawler-name: gpages-github-actions-test-${{ github.ref }} 24 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 25 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 26 | site-url: 'https://community.algolia.com/algoliasearch-crawler-github-actions/public/github-pages/' 27 | override-config: true 28 | -------------------------------------------------------------------------------- /.github/workflows/netlify.yml: -------------------------------------------------------------------------------- 1 | name: Netlify -> Algolia Crawler 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | types: ['opened', 'edited', 'reopened', 'synchronize'] 7 | 8 | jobs: 9 | algolia_recrawl: 10 | name: Algolia Recrawl 11 | runs-on: ubuntu-latest 12 | steps: 13 | # checkout this repo 14 | - name: Checkout Repo 15 | uses: actions/checkout@v2 16 | - name: Sleep for 30s 17 | run: sleep 30 18 | - name: Netlify-PR => Algolia crawler creation and recrawl on preview (Pull Request) 19 | if: github.ref != 'refs/heads/main' 20 | uses: ./ 21 | id: crawler_pr 22 | with: 23 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 24 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 25 | crawler-api-base-url: 'https://crawler-dev.algolia.com/api/1' 26 | crawler-name: netlify-github-actions-test-${{ github.ref }} 27 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 28 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 29 | site-url: 'https://deploy-preview-${{ github.event.pull_request.number }}--algolia-ga-actions-netlify.netlify.app/' 30 | override-config: true 31 | - name: Netlify-MAIN => Algolia crawler creation and recrawl (Push on Main branch) 32 | if: github.ref == 'refs/heads/main' 33 | uses: ./ 34 | id: crawler_push 35 | with: 36 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 37 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 38 | crawler-api-base-url: 'https://crawler.algolia.com/api/1' 39 | crawler-name: netlify-github-actions-test-${{ github.ref }} 40 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 41 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 42 | site-url: 'https://algolia-ga-actions-netlify.netlify.app/' 43 | override-config: true 44 | 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .vscode 4 | .scannerwork 5 | dist/ 6 | node_modules/ 7 | 8 | .env 9 | .secrets 10 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 16.19.1 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [1.1.13](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.12...v1.1.13) (2024-10-22) 2 | 3 | 4 | ### Bug Fixes 5 | 6 | * change `site-url` to a more generic example ([#502](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/502)) ([a3d716d](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/a3d716da5704d8ea95e79695912bcc5d853a0e8f)) 7 | 8 | ## 
[1.1.12](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.11...v1.1.12) (2024-10-05) 9 | 10 | 11 | ### Bug Fixes 12 | 13 | * **deps:** update dependency @actions/core to v1.11.1 ([#501](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/501)) ([fd56fa8](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/fd56fa89871c59e7a446ca8db951552dc79afd37)) 14 | 15 | ## [1.1.11](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.10...v1.1.11) (2024-10-04) 16 | 17 | 18 | ### Bug Fixes 19 | 20 | * **deps:** update dependency @actions/core to v1.11.0 ([#500](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/500)) ([b4ea512](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/b4ea5129ffa12c11478009dc1c6fef9d1bac3494)) 21 | 22 | ## [1.1.10](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.9...v1.1.10) (2023-09-15) 23 | 24 | 25 | ### Bug Fixes 26 | 27 | * **deps:** update dependency @actions/core to v1.10.1 ([#463](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/463)) ([36f6ee6](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/36f6ee63df16dfffdf56d0307e8c1a82b8b2a853)) 28 | 29 | ## [1.1.9](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.8...v1.1.9) (2022-09-30) 30 | 31 | 32 | ### Bug Fixes 33 | 34 | * **deps:** update dependency @actions/github to v5.1.1 ([#335](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/335)) ([c631bdf](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/c631bdfcfcbfad5b828aeac041f789b4ef84b053)) 35 | 36 | ## [1.1.8](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.7...v1.1.8) (2022-09-30) 37 | 38 | 39 | ### Bug Fixes 40 | 41 | * **deps:** update dependency @actions/core to v1.10.0 ([#334](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/334)) ([00d12b2](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/00d12b271c79ce11ed3c7ce3b623959dfe9a64e2)) 42 | 43 | ## [1.1.7](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.6...v1.1.7) (2022-09-23) 44 | 45 | 46 | ### Bug Fixes 47 | 48 | * **deps:** update dependency @actions/github to v5.1.0 ([#327](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/327)) ([5018340](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/50183404fc69aa16428b62ea263922c719c37e9d)) 49 | 50 | ## [1.1.6](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.5...v1.1.6) (2022-08-12) 51 | 52 | 53 | ### Bug Fixes 54 | 55 | * **deps:** update dependency @actions/core to v1.9.1 ([#299](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/299)) ([3f7dd03](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/3f7dd034cd4baf3163b44a544908ed31a6806681)) 56 | 57 | ## [1.1.5](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.4...v1.1.5) (2022-06-18) 58 | 59 | 60 | ### Bug Fixes 61 | 62 | * **deps:** update dependency @actions/core to v1.9.0 ([#268](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/268)) ([fa7bb30](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/fa7bb30aedc94fed64e59fde0c0541a8459f4e89)) 63 | 64 | ## [1.1.4](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.3...v1.1.4) (2022-05-13) 65 | 66 | 67 | ### 
Bug Fixes 68 | 69 | * **deps:** update dependency @actions/github to v5.0.3 ([#229](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/229)) ([142b0bc](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/142b0bc43f19ceec410599d6d0bbeafe0c8b43fb)) 70 | 71 | ## [1.1.3](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.2...v1.1.3) (2022-05-13) 72 | 73 | 74 | ### Bug Fixes 75 | 76 | * **deps:** update dependency @actions/core to v1.8.2 ([#228](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/228)) ([437df5e](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/437df5ed2c0e72837d7c4f17c6ede1cbbf8f0481)) 77 | 78 | ## [1.1.2](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.1...v1.1.2) (2022-05-06) 79 | 80 | 81 | ### Bug Fixes 82 | 83 | * **deps:** update dependency @actions/core to v1.8.0 ([#224](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/224)) ([dfcfe0e](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/dfcfe0efa9100369386306a9d44eaa0fac06bf5b)) 84 | 85 | ## [1.1.1](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.1.0...v1.1.1) (2022-04-29) 86 | 87 | 88 | ### Bug Fixes 89 | 90 | * **deps:** update dependency @actions/core to v1.7.0 ([#220](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/220)) ([1e2812f](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/1e2812fa822256ce9b49a9ddd6c24623c63bc8ee)) 91 | 92 | # [1.1.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.14...v1.1.0) (2022-04-12) 93 | 94 | 95 | ### Features 96 | 97 | * add testing ([#203](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/203)) ([cc499fb](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/cc499fbc73cc6221f91f12f7c89501123e2d6c53)) 98 | 99 | ## [1.0.14](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.13...v1.0.14) (2022-04-04) 100 | 101 | 102 | ### Bug Fixes 103 | 104 | * **deps:** update dependency @actions/github to v5.0.1 ([#196](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/196)) ([c2a0c9a](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/c2a0c9ac09056730da31827d7b89fd56e2b03480)) 105 | 106 | ## [1.0.13](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.12...v1.0.13) (2022-03-18) 107 | 108 | 109 | ### Bug Fixes 110 | 111 | * **deps:** update dependency node-fetch to v3.2.3 ([#183](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/183)) ([f07164d](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/f07164d805ebca9f18cd871264af05d77f242937)) 112 | 113 | ## [1.0.12](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.11...v1.0.12) (2022-03-11) 114 | 115 | 116 | ### Bug Fixes 117 | 118 | * **deps:** update dependency node-fetch to v3.2.2 ([#173](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/173)) ([30f04a3](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/30f04a3c7f0241085fcb5b27ab9525d93386e819)) 119 | 120 | ## [1.0.11](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.10...v1.0.11) (2022-01-24) 121 | 122 | 123 | ### Bug Fixes 124 | 125 | * **deps:** update dependency node-fetch to v3.2.0 
([#165](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/165)) ([d4f4625](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/d4f4625f4fd8f11683bb72ead9b557d1e4902b04)) 126 | 127 | ## [1.0.10](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.9...v1.0.10) (2021-11-24) 128 | 129 | 130 | ### Bug Fixes 131 | 132 | * cast boolean ([8593541](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/85935415e86157bede081bff58dfc07b4d33b5c1)) 133 | 134 | ## [1.0.9](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.8...v1.0.9) (2021-11-24) 135 | 136 | 137 | ### Bug Fixes 138 | 139 | * core.setFailed ([956f188](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/956f188ad9c78db5d3473730c48347f833a93464)) 140 | 141 | ## [1.0.8](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.7...v1.0.8) (2021-11-24) 142 | 143 | 144 | ### Reverts 145 | 146 | * Revert "fix(deps): update dependency node-fetch to v3.1.0" ([359b47e](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/359b47e0c8c54f612196aa8728e386a79d0c4b63)) 147 | 148 | ## [1.0.7](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.6...v1.0.7) (2021-11-24) 149 | 150 | 151 | ### Bug Fixes 152 | 153 | * **action:** use node12 runner ([#111](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/111)) ([2559b93](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/2559b93e2d76d94fa043c560b5571d8d83fc13bb)) 154 | 155 | ## [1.0.6](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.5...v1.0.6) (2021-11-13) 156 | 157 | 158 | ### Bug Fixes 159 | 160 | * **deps:** update dependency node-fetch to v3.1.0 ([51d393f](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/51d393ff67933c7eb00b320550492529ed3c18cf)) 161 | 162 | ## [1.0.5](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.4...v1.0.5) (2021-11-02) 163 | 164 | 165 | ### Bug Fixes 166 | 167 | * lint ([5d4c532](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/5d4c53267622ea0d45d2e39bf5abc74341fa0392)) 168 | 169 | ## [1.0.4](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.3...v1.0.4) (2021-10-01) 170 | 171 | 172 | ### Bug Fixes 173 | 174 | * **deps:** update dependency @actions/core to v1.6.0 ([b367847](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/b367847d383bee338124f9a9a11c050bf9e81494)) 175 | 176 | ## [1.0.3](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.2...v1.0.3) (2021-09-27) 177 | 178 | 179 | ### Bug Fixes 180 | 181 | * remove unnecessary lock ([428ef97](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/428ef97a8b3ecf4e7ab4bd6e7566c1d301a8b7ec)) 182 | 183 | ## [1.0.2](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.1...v1.0.2) (2021-09-27) 184 | 185 | 186 | ### Bug Fixes 187 | 188 | * add user-agent ([36addeb](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/36addebf7d200ea17d17d311207a1a873e99d29a)) 189 | 190 | ## [1.0.1](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v1.0.0...v1.0.1) (2021-09-24) 191 | 192 | 193 | ### Bug Fixes 194 | 195 | * documentation update ([e45ae73](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/e45ae73cd47e58ef1eb4e0be363c50312bab62f5)) 196 | 197 | # 
[0.9.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.8.4...v0.9.0) (2021-09-24) 198 | 199 | 200 | ### Features 201 | 202 | * BREAKING CHANGE v1 ([8fcfee3](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/8fcfee3a5c727f2cfbfc6555d0392b17a3bfe74e)) 203 | 204 | ## [0.8.4](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.8.3...v0.8.4) (2021-09-24) 205 | 206 | 207 | ### Bug Fixes 208 | 209 | * typing issue ([8afa9a0](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/8afa9a07515e632e8516a9bd00e3a1b236065cb1)) 210 | * **deps:** update dependency node-fetch to v3 ([#48](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/48)) ([9341f6a](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/9341f6a2fa2fe583f63a7bae178a65eb0a821e1c)) 211 | 212 | ## [0.8.3](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.8.2...v0.8.3) (2021-09-17) 213 | 214 | 215 | ### Bug Fixes 216 | 217 | * **deps:** update dependency node-fetch to v2.6.2 ([a5f735c](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/a5f735c1a4a1d3056fba010916f9761a0cec1595)) 218 | 219 | ## [0.8.2](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.8.1...v0.8.2) (2021-09-16) 220 | 221 | 222 | ### Bug Fixes 223 | 224 | * handle error in catch ([#57](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/57)) ([2fc63bb](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/2fc63bb64fb96b907ace4a965ea4df9820712498)) 225 | 226 | ## [0.8.1](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.8.0...v0.8.1) (2021-08-27) 227 | 228 | 229 | ### Bug Fixes 230 | 231 | * **deps:** update dependency @actions/core to v1.5.0 ([#38](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/38)) ([2bdbaeb](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/2bdbaeb32f6f916e6e29ab07583a1e47d4d4f8dc)) 232 | 233 | # [0.8.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.7.3...v0.8.0) (2021-07-20) 234 | 235 | 236 | ### Features 237 | 238 | * automatic crawler name ([#24](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/24)) ([d6861c2](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/d6861c20f5ca277178bcc79f301150d7c67e0ab8)) 239 | 240 | ## [0.7.3](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.7.2...v0.7.3) (2021-07-19) 241 | 242 | 243 | ### Bug Fixes 244 | 245 | * add compile to release process ([#23](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/23)) ([9669048](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/966904857d2c29cdb8eff9e5c499f4c4c798785f)) 246 | 247 | ## [0.7.2](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.7.1...v0.7.2) (2021-07-16) 248 | 249 | 250 | ### Bug Fixes 251 | 252 | * check if the comment exists or not before creating it. 
([#22](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/22)) ([3ba3f5b](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/3ba3f5b4c71981e4b2e1870e2c01473ddd2f500f)) 253 | 254 | ## [0.7.1](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.7.0...v0.7.1) (2021-07-13) 255 | 256 | 257 | ### Bug Fixes 258 | 259 | * update readme ([646ff1d](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/646ff1d6cd50b7b111990d1ff2d681b995522345)) 260 | 261 | # [0.7.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.6.0...v0.7.0) (2021-07-13) 262 | 263 | 264 | ### Features 265 | 266 | * Add comment on PR ([#20](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/20)) ([6ed9d2c](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/6ed9d2c6cf0293afd1703537572daccf2d65ddb7)) 267 | 268 | # [0.6.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.5.0...v0.6.0) (2021-07-02) 269 | 270 | 271 | ### Features 272 | 273 | * use new crawler parameters name and appId ([#19](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/19)) ([7b5c1a3](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/7b5c1a3ab4b1cb6c1e2bcadb8be866006aef8bc1)) 274 | 275 | # [0.5.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.4.2...v0.5.0) (2021-06-30) 276 | 277 | 278 | ### Features 279 | 280 | * Add override config boolean to the Github action config ([#18](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/18)) ([c28306e](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/c28306e8ab3d7997978bf8fd5f8f5652419f9cd4)) 281 | 282 | ## [0.4.2](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.4.1...v0.4.2) (2021-06-12) 283 | 284 | 285 | ### Bug Fixes 286 | 287 | * **deps:** update dependency @actions/core to v1.4.0 ([0c8e0e5](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/0c8e0e5ccfdedcfe2b303c888de134fae070a9c2)) 288 | 289 | ## [0.4.1](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.4.0...v0.4.1) (2021-06-12) 290 | 291 | 292 | ### Bug Fixes 293 | 294 | * **deps:** pin dependencies ([ea82417](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/ea824172f181d1354e4b0c6fd65333740a254da6)) 295 | 296 | # [0.4.0](https://github.com/algolia/algoliasearch-crawler-github-actions/compare/v0.3.1...v0.4.0) (2021-06-10) 297 | 298 | 299 | ### Features 300 | 301 | * Upgrade TS strictness ([#1](https://github.com/algolia/algoliasearch-crawler-github-actions/issues/1)) ([5a15853](https://github.com/algolia/algoliasearch-crawler-github-actions/commit/5a15853b7efc1eeb3db58e14b4737e63a2df82eb)) 302 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ##  Commands 4 | 5 | ```sh 6 | yarn test # runs tests within src/ directory 7 | yarn build # builds the project 8 | yarn clean # empties the dist/ directory 9 | yarn lint # runs eslinter 10 | yarn semantic-release # runs semantic-release 11 | yarn compile # compiles the index.js file with vercel/ncc 12 | ``` 13 | 14 | ## Releasing 15 | 16 | - Use a semantic commit 17 | - CI will compile and commit the `build/` (you don't need to commit it) 18 | - A release will be created but not published to the 
marketplace 19 | - Go to Releases 20 | - Edit the release and check "Publish this Action to the GitHub Marketplace" 21 | - Save 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-present Algolia, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Algolia Crawler Github Action 3 | 4 | [![CircleCI](https://circleci.com/gh/algolia/algoliasearch-crawler-github-actions/tree/main.svg?style=svg)](https://circleci.com/gh/algolia/algoliasearch-crawler-github-actions/tree/main) 5 | 6 | # Algolia Crawler Github Action 7 | 8 | Automatically index your website to Algolia with the Algolia Crawler whenever you deploy it. 9 | **IMPORTANT**: This Github Action is **only available for Algolia users with Crawler Public API access**. 10 | 11 | - [What is Algolia?](https://www.algolia.com/doc/guides/getting-started/what-is-algolia/) 12 | - [What is Algolia's Crawler?](https://www.algolia.com/doc/tools/crawler/getting-started/overview/) 13 | 14 | ## Platform support 15 | 16 | It should be compatible with **any host** as long as you provide the correct `site-url`. 17 | On top of that, it has been tested with the following platforms: 18 | 19 | - Github Pages 20 | - Microsoft Azure 21 | - Netlify 22 | - Vercel 23 | 24 | ## How to add this Github Action to your workflow? 25 | 26 | On your repository: 27 | 28 | - Create a Github Workflow file `.github/workflows/[FILENAME].yml`. 29 | - Add a new job after your website deployment. For the Crawler to work, the website must be up and running. 
30 | 31 | ```yaml 32 | - name: Algolia crawler creation and crawl 33 | uses: algolia/algoliasearch-crawler-github-actions@v1.0.10 34 | id: algolia_crawler 35 | with: # mandatory parameters 36 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 37 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 38 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 39 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 40 | site-url: 'https://example.com' 41 | ``` 42 | 43 | ## Examples 44 | 45 | - [Basic](/examples/basic.yml) 46 | - [Github Pages](/examples/github_pages.yml) 47 | - [Netlify](/examples/netlify.yml) 48 | - Vercel: [PR](/examples/vercel_pr.yml), [Main branch](/examples/vercel_push.yml) 49 | 50 | ## Parameters to provide 51 | 52 | ### Mandatory parameters 53 | 54 | - `crawler-user-id` 55 | 56 | User ID of your Crawler account 57 | - `crawler-api-key` 58 | 59 | API key of your Crawler account 60 | - `algolia-app-id` 61 | 62 | Algolia Application ID 63 | - `algolia-api-key` 64 | 65 | Algolia API key 66 | - `site-url` 67 | 68 | URL of the website to crawl 69 | 70 | ### Optional parameters 71 | 72 | - `crawler-api-base-url` 73 | 74 | Base URL of the Crawler API, default: [https://crawler.algolia.com/api/1/](https://crawler.algolia.com/api/1/) 75 | - `crawler-name` 76 | 77 | Name of the created crawler, default: `'[Github] ${{ github.repository }} ${{ github.ref }}'` 78 | - `override-config` 79 | 80 | Boolean that defines whether an existing crawler configuration should be overridden, default: `false` 81 | - `github-token` 82 | 83 | Needed for adding comments to PRs, default: the Github Action `${{ github.token }}` variable 84 | 85 | ## Github secrets on your repository 86 | 87 | We highly recommend that you don't define sensitive information such as Algolia and Crawler credentials directly in the YAML file, and that you **use Github secrets** instead (defined in Settings > Secrets). 88 | 89 | ### Recommended 90 | 91 | - `ALGOLIA_API_KEY` 92 | 93 | Algolia API key 94 | - `ALGOLIA_APP_ID` 95 | 96 | Algolia Application ID 97 | - `CRAWLER_API_KEY` 98 | 99 | API key of your Crawler account 100 | - `CRAWLER_USER_ID` 101 | 102 | User ID of your Crawler account 103 | 104 | ## Troubleshooting 105 | 106 | - Need help? We have you covered in our [Discourse forum](https://discourse.algolia.com/) 107 | - Found a bug in the action? Please read our [contributing guide](/CONTRIBUTING.md) and either open an [issue](https://github.com/algolia/algoliasearch-crawler-github-actions/issues) or a [pull request](https://github.com/algolia/algoliasearch-crawler-github-actions/pulls) 108 | - Can't find the answer to your issue? Please reach out to [support@algolia.com](mailto:support@algolia.com) 109 | 110 | ## Development & Release 111 | 112 | See [CONTRIBUTING.md](./CONTRIBUTING.md). 113 | -------------------------------------------------------------------------------- /action.yml: -------------------------------------------------------------------------------- 1 | name: 'Algolia Crawler Automatic Crawl' 2 | description: 'Automatically trigger a crawl and push to Algolia when deploying a website using the Algolia Crawler.' 
3 | branding: 4 | icon: "upload-cloud" 5 | color: "blue" 6 | 7 | inputs: 8 | # CREDENTIALS 9 | crawler-user-id: 10 | description: 'Crawler user ID' 11 | required: true 12 | crawler-api-key: 13 | description: 'Crawler API key' 14 | required: true 15 | crawler-api-base-url: 16 | description: 'Crawler API URL' 17 | required: false 18 | default: 'https://crawler.algolia.com/api/1' 19 | github-token: 20 | description: 'Github token' 21 | required: true 22 | default: ${{ github.token }} 23 | 24 | # CRAWLER CONFIGURATION 25 | crawler-name: 26 | description: 'Name of the crawler' 27 | required: true 28 | default: '[Github] ${{ github.repository }} ${{ github.ref }}' 29 | algolia-app-id: 30 | description: 'Algolia Application ID' 31 | required: true 32 | algolia-api-key: 33 | description: 'Algolia API key' 34 | required: true 35 | site-url: 36 | description: 'url of the site to crawl' 37 | required: true 38 | override-config: 39 | description: 'Override config in case the crawler is already existing' 40 | required: false 41 | default: false 42 | 43 | runs: 44 | using: 'node16' 45 | main: 'build/index.js' 46 | -------------------------------------------------------------------------------- /build/sourcemap-register.js: -------------------------------------------------------------------------------- 1 | (()=>{var e={650:e=>{var r=Object.prototype.toString;var n=typeof Buffer.alloc==="function"&&typeof Buffer.allocUnsafe==="function"&&typeof Buffer.from==="function";function isArrayBuffer(e){return r.call(e).slice(8,-1)==="ArrayBuffer"}function fromArrayBuffer(e,r,t){r>>>=0;var o=e.byteLength-r;if(o<0){throw new RangeError("'offset' is out of bounds")}if(t===undefined){t=o}else{t>>>=0;if(t>o){throw new RangeError("'length' is out of bounds")}}return n?Buffer.from(e.slice(r,r+t)):new Buffer(new Uint8Array(e.slice(r,r+t)))}function fromString(e,r){if(typeof r!=="string"||r===""){r="utf8"}if(!Buffer.isEncoding(r)){throw new TypeError('"encoding" must be a valid string encoding')}return n?Buffer.from(e,r):new Buffer(e,r)}function bufferFrom(e,r,t){if(typeof e==="number"){throw new TypeError('"value" argument must not be a number')}if(isArrayBuffer(e)){return fromArrayBuffer(e,r,t)}if(typeof e==="string"){return fromString(e,r)}return n?Buffer.from(e):new Buffer(e)}e.exports=bufferFrom},274:(e,r,n)=>{var t=n(339);var o=Object.prototype.hasOwnProperty;var i=typeof Map!=="undefined";function ArraySet(){this._array=[];this._set=i?new Map:Object.create(null)}ArraySet.fromArray=function ArraySet_fromArray(e,r){var n=new ArraySet;for(var t=0,o=e.length;t=0){return r}}else{var n=t.toSetString(e);if(o.call(this._set,n)){return this._set[n]}}throw new Error('"'+e+'" is not in the set.')};ArraySet.prototype.at=function ArraySet_at(e){if(e>=0&&e{var t=n(190);var o=5;var i=1<>1;return r?-n:n}r.encode=function base64VLQ_encode(e){var r="";var n;var i=toVLQSigned(e);do{n=i&a;i>>>=o;if(i>0){n|=u}r+=t.encode(n)}while(i>0);return r};r.decode=function base64VLQ_decode(e,r,n){var i=e.length;var s=0;var l=0;var c,p;do{if(r>=i){throw new Error("Expected more digits in base 64 VLQ value.")}p=t.decode(e.charCodeAt(r++));if(p===-1){throw new Error("Invalid base64 digit: "+e.charAt(r-1))}c=!!(p&u);p&=a;s=s+(p<{var n="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".split("");r.encode=function(e){if(0<=e&&e{r.GREATEST_LOWER_BOUND=1;r.LEAST_UPPER_BOUND=2;function recursiveSearch(e,n,t,o,i,a){var u=Math.floor((n-e)/2)+e;var s=i(t,o[u],true);if(s===0){return u}else if(s>0){if(n-u>1){return 
recursiveSearch(u,n,t,o,i,a)}if(a==r.LEAST_UPPER_BOUND){return n1){return recursiveSearch(e,u,t,o,i,a)}if(a==r.LEAST_UPPER_BOUND){return u}else{return e<0?-1:e}}}r.search=function search(e,n,t,o){if(n.length===0){return-1}var i=recursiveSearch(-1,n.length,e,n,t,o||r.GREATEST_LOWER_BOUND);if(i<0){return-1}while(i-1>=0){if(t(n[i],n[i-1],true)!==0){break}--i}return i}},680:(e,r,n)=>{var t=n(339);function generatedPositionAfter(e,r){var n=e.generatedLine;var o=r.generatedLine;var i=e.generatedColumn;var a=r.generatedColumn;return o>n||o==n&&a>=i||t.compareByGeneratedPositionsInflated(e,r)<=0}function MappingList(){this._array=[];this._sorted=true;this._last={generatedLine:-1,generatedColumn:0}}MappingList.prototype.unsortedForEach=function MappingList_forEach(e,r){this._array.forEach(e,r)};MappingList.prototype.add=function MappingList_add(e){if(generatedPositionAfter(this._last,e)){this._last=e;this._array.push(e)}else{this._sorted=false;this._array.push(e)}};MappingList.prototype.toArray=function MappingList_toArray(){if(!this._sorted){this._array.sort(t.compareByGeneratedPositionsInflated);this._sorted=true}return this._array};r.H=MappingList},758:(e,r)=>{function swap(e,r,n){var t=e[r];e[r]=e[n];e[n]=t}function randomIntInRange(e,r){return Math.round(e+Math.random()*(r-e))}function doQuickSort(e,r,n,t){if(n{var t;var o=n(339);var i=n(345);var a=n(274).I;var u=n(449);var s=n(758).U;function SourceMapConsumer(e,r){var n=e;if(typeof e==="string"){n=o.parseSourceMapInput(e)}return n.sections!=null?new IndexedSourceMapConsumer(n,r):new BasicSourceMapConsumer(n,r)}SourceMapConsumer.fromSourceMap=function(e,r){return BasicSourceMapConsumer.fromSourceMap(e,r)};SourceMapConsumer.prototype._version=3;SourceMapConsumer.prototype.__generatedMappings=null;Object.defineProperty(SourceMapConsumer.prototype,"_generatedMappings",{configurable:true,enumerable:true,get:function(){if(!this.__generatedMappings){this._parseMappings(this._mappings,this.sourceRoot)}return this.__generatedMappings}});SourceMapConsumer.prototype.__originalMappings=null;Object.defineProperty(SourceMapConsumer.prototype,"_originalMappings",{configurable:true,enumerable:true,get:function(){if(!this.__originalMappings){this._parseMappings(this._mappings,this.sourceRoot)}return this.__originalMappings}});SourceMapConsumer.prototype._charIsMappingSeparator=function SourceMapConsumer_charIsMappingSeparator(e,r){var n=e.charAt(r);return n===";"||n===","};SourceMapConsumer.prototype._parseMappings=function SourceMapConsumer_parseMappings(e,r){throw new Error("Subclasses must implement _parseMappings")};SourceMapConsumer.GENERATED_ORDER=1;SourceMapConsumer.ORIGINAL_ORDER=2;SourceMapConsumer.GREATEST_LOWER_BOUND=1;SourceMapConsumer.LEAST_UPPER_BOUND=2;SourceMapConsumer.prototype.eachMapping=function SourceMapConsumer_eachMapping(e,r,n){var t=r||null;var i=n||SourceMapConsumer.GENERATED_ORDER;var a;switch(i){case SourceMapConsumer.GENERATED_ORDER:a=this._generatedMappings;break;case SourceMapConsumer.ORIGINAL_ORDER:a=this._originalMappings;break;default:throw new Error("Unknown order of iteration.")}var u=this.sourceRoot;a.map((function(e){var r=e.source===null?null:this._sources.at(e.source);r=o.computeSourceURL(u,r,this._sourceMapURL);return{source:r,generatedLine:e.generatedLine,generatedColumn:e.generatedColumn,originalLine:e.originalLine,originalColumn:e.originalColumn,name:e.name===null?null:this._names.at(e.name)}}),this).forEach(e,t)};SourceMapConsumer.prototype.allGeneratedPositionsFor=function 
SourceMapConsumer_allGeneratedPositionsFor(e){var r=o.getArg(e,"line");var n={source:o.getArg(e,"source"),originalLine:r,originalColumn:o.getArg(e,"column",0)};n.source=this._findSourceIndex(n.source);if(n.source<0){return[]}var t=[];var a=this._findMapping(n,this._originalMappings,"originalLine","originalColumn",o.compareByOriginalPositions,i.LEAST_UPPER_BOUND);if(a>=0){var u=this._originalMappings[a];if(e.column===undefined){var s=u.originalLine;while(u&&u.originalLine===s){t.push({line:o.getArg(u,"generatedLine",null),column:o.getArg(u,"generatedColumn",null),lastColumn:o.getArg(u,"lastGeneratedColumn",null)});u=this._originalMappings[++a]}}else{var l=u.originalColumn;while(u&&u.originalLine===r&&u.originalColumn==l){t.push({line:o.getArg(u,"generatedLine",null),column:o.getArg(u,"generatedColumn",null),lastColumn:o.getArg(u,"lastGeneratedColumn",null)});u=this._originalMappings[++a]}}}return t};r.SourceMapConsumer=SourceMapConsumer;function BasicSourceMapConsumer(e,r){var n=e;if(typeof e==="string"){n=o.parseSourceMapInput(e)}var t=o.getArg(n,"version");var i=o.getArg(n,"sources");var u=o.getArg(n,"names",[]);var s=o.getArg(n,"sourceRoot",null);var l=o.getArg(n,"sourcesContent",null);var c=o.getArg(n,"mappings");var p=o.getArg(n,"file",null);if(t!=this._version){throw new Error("Unsupported version: "+t)}if(s){s=o.normalize(s)}i=i.map(String).map(o.normalize).map((function(e){return s&&o.isAbsolute(s)&&o.isAbsolute(e)?o.relative(s,e):e}));this._names=a.fromArray(u.map(String),true);this._sources=a.fromArray(i,true);this._absoluteSources=this._sources.toArray().map((function(e){return o.computeSourceURL(s,e,r)}));this.sourceRoot=s;this.sourcesContent=l;this._mappings=c;this._sourceMapURL=r;this.file=p}BasicSourceMapConsumer.prototype=Object.create(SourceMapConsumer.prototype);BasicSourceMapConsumer.prototype.consumer=SourceMapConsumer;BasicSourceMapConsumer.prototype._findSourceIndex=function(e){var r=e;if(this.sourceRoot!=null){r=o.relative(this.sourceRoot,r)}if(this._sources.has(r)){return this._sources.indexOf(r)}var n;for(n=0;n1){v.source=l+_[1];l+=_[1];v.originalLine=i+_[2];i=v.originalLine;v.originalLine+=1;v.originalColumn=a+_[3];a=v.originalColumn;if(_.length>4){v.name=c+_[4];c+=_[4]}}m.push(v);if(typeof v.originalLine==="number"){d.push(v)}}}s(m,o.compareByGeneratedPositionsDeflated);this.__generatedMappings=m;s(d,o.compareByOriginalPositions);this.__originalMappings=d};BasicSourceMapConsumer.prototype._findMapping=function SourceMapConsumer_findMapping(e,r,n,t,o,a){if(e[n]<=0){throw new TypeError("Line must be greater than or equal to 1, got "+e[n])}if(e[t]<0){throw new TypeError("Column must be greater than or equal to 0, got "+e[t])}return i.search(e,r,o,a)};BasicSourceMapConsumer.prototype.computeColumnSpans=function SourceMapConsumer_computeColumnSpans(){for(var e=0;e=0){var t=this._generatedMappings[n];if(t.generatedLine===r.generatedLine){var i=o.getArg(t,"source",null);if(i!==null){i=this._sources.at(i);i=o.computeSourceURL(this.sourceRoot,i,this._sourceMapURL)}var a=o.getArg(t,"name",null);if(a!==null){a=this._names.at(a)}return{source:i,line:o.getArg(t,"originalLine",null),column:o.getArg(t,"originalColumn",null),name:a}}}return{source:null,line:null,column:null,name:null}};BasicSourceMapConsumer.prototype.hasContentsOfAllSources=function BasicSourceMapConsumer_hasContentsOfAllSources(){if(!this.sourcesContent){return false}return this.sourcesContent.length>=this._sources.size()&&!this.sourcesContent.some((function(e){return 
e==null}))};BasicSourceMapConsumer.prototype.sourceContentFor=function SourceMapConsumer_sourceContentFor(e,r){if(!this.sourcesContent){return null}var n=this._findSourceIndex(e);if(n>=0){return this.sourcesContent[n]}var t=e;if(this.sourceRoot!=null){t=o.relative(this.sourceRoot,t)}var i;if(this.sourceRoot!=null&&(i=o.urlParse(this.sourceRoot))){var a=t.replace(/^file:\/\//,"");if(i.scheme=="file"&&this._sources.has(a)){return this.sourcesContent[this._sources.indexOf(a)]}if((!i.path||i.path=="/")&&this._sources.has("/"+t)){return this.sourcesContent[this._sources.indexOf("/"+t)]}}if(r){return null}else{throw new Error('"'+t+'" is not in the SourceMap.')}};BasicSourceMapConsumer.prototype.generatedPositionFor=function SourceMapConsumer_generatedPositionFor(e){var r=o.getArg(e,"source");r=this._findSourceIndex(r);if(r<0){return{line:null,column:null,lastColumn:null}}var n={source:r,originalLine:o.getArg(e,"line"),originalColumn:o.getArg(e,"column")};var t=this._findMapping(n,this._originalMappings,"originalLine","originalColumn",o.compareByOriginalPositions,o.getArg(e,"bias",SourceMapConsumer.GREATEST_LOWER_BOUND));if(t>=0){var i=this._originalMappings[t];if(i.source===n.source){return{line:o.getArg(i,"generatedLine",null),column:o.getArg(i,"generatedColumn",null),lastColumn:o.getArg(i,"lastGeneratedColumn",null)}}}return{line:null,column:null,lastColumn:null}};t=BasicSourceMapConsumer;function IndexedSourceMapConsumer(e,r){var n=e;if(typeof e==="string"){n=o.parseSourceMapInput(e)}var t=o.getArg(n,"version");var i=o.getArg(n,"sections");if(t!=this._version){throw new Error("Unsupported version: "+t)}this._sources=new a;this._names=new a;var u={line:-1,column:0};this._sections=i.map((function(e){if(e.url){throw new Error("Support for url field in sections not implemented.")}var n=o.getArg(e,"offset");var t=o.getArg(n,"line");var i=o.getArg(n,"column");if(t{var t=n(449);var o=n(339);var i=n(274).I;var a=n(680).H;function SourceMapGenerator(e){if(!e){e={}}this._file=o.getArg(e,"file",null);this._sourceRoot=o.getArg(e,"sourceRoot",null);this._skipValidation=o.getArg(e,"skipValidation",false);this._sources=new i;this._names=new i;this._mappings=new a;this._sourcesContents=null}SourceMapGenerator.prototype._version=3;SourceMapGenerator.fromSourceMap=function SourceMapGenerator_fromSourceMap(e){var r=e.sourceRoot;var n=new SourceMapGenerator({file:e.file,sourceRoot:r});e.eachMapping((function(e){var t={generated:{line:e.generatedLine,column:e.generatedColumn}};if(e.source!=null){t.source=e.source;if(r!=null){t.source=o.relative(r,t.source)}t.original={line:e.originalLine,column:e.originalColumn};if(e.name!=null){t.name=e.name}}n.addMapping(t)}));e.sources.forEach((function(t){var i=t;if(r!==null){i=o.relative(r,t)}if(!n._sources.has(i)){n._sources.add(i)}var a=e.sourceContentFor(t);if(a!=null){n.setSourceContent(t,a)}}));return n};SourceMapGenerator.prototype.addMapping=function SourceMapGenerator_addMapping(e){var r=o.getArg(e,"generated");var n=o.getArg(e,"original",null);var t=o.getArg(e,"source",null);var i=o.getArg(e,"name",null);if(!this._skipValidation){this._validateMapping(r,n,t,i)}if(t!=null){t=String(t);if(!this._sources.has(t)){this._sources.add(t)}}if(i!=null){i=String(i);if(!this._names.has(i)){this._names.add(i)}}this._mappings.add({generatedLine:r.line,generatedColumn:r.column,originalLine:n!=null&&n.line,originalColumn:n!=null&&n.column,source:t,name:i})};SourceMapGenerator.prototype.setSourceContent=function SourceMapGenerator_setSourceContent(e,r){var 
n=e;if(this._sourceRoot!=null){n=o.relative(this._sourceRoot,n)}if(r!=null){if(!this._sourcesContents){this._sourcesContents=Object.create(null)}this._sourcesContents[o.toSetString(n)]=r}else if(this._sourcesContents){delete this._sourcesContents[o.toSetString(n)];if(Object.keys(this._sourcesContents).length===0){this._sourcesContents=null}}};SourceMapGenerator.prototype.applySourceMap=function SourceMapGenerator_applySourceMap(e,r,n){var t=r;if(r==null){if(e.file==null){throw new Error("SourceMapGenerator.prototype.applySourceMap requires either an explicit source file, "+'or the source map\'s "file" property. Both were omitted.')}t=e.file}var a=this._sourceRoot;if(a!=null){t=o.relative(a,t)}var u=new i;var s=new i;this._mappings.unsortedForEach((function(r){if(r.source===t&&r.originalLine!=null){var i=e.originalPositionFor({line:r.originalLine,column:r.originalColumn});if(i.source!=null){r.source=i.source;if(n!=null){r.source=o.join(n,r.source)}if(a!=null){r.source=o.relative(a,r.source)}r.originalLine=i.line;r.originalColumn=i.column;if(i.name!=null){r.name=i.name}}}var l=r.source;if(l!=null&&!u.has(l)){u.add(l)}var c=r.name;if(c!=null&&!s.has(c)){s.add(c)}}),this);this._sources=u;this._names=s;e.sources.forEach((function(r){var t=e.sourceContentFor(r);if(t!=null){if(n!=null){r=o.join(n,r)}if(a!=null){r=o.relative(a,r)}this.setSourceContent(r,t)}}),this)};SourceMapGenerator.prototype._validateMapping=function SourceMapGenerator_validateMapping(e,r,n,t){if(r&&typeof r.line!=="number"&&typeof r.column!=="number"){throw new Error("original.line and original.column are not numbers -- you probably meant to omit "+"the original mapping entirely and only map the generated position. If so, pass "+"null for the original mapping instead of an object with empty or null values.")}if(e&&"line"in e&&"column"in e&&e.line>0&&e.column>=0&&!r&&!n&&!t){return}else if(e&&"line"in e&&"column"in e&&r&&"line"in r&&"column"in r&&e.line>0&&e.column>=0&&r.line>0&&r.column>=0&&n){return}else{throw new Error("Invalid mapping: "+JSON.stringify({generated:e,source:n,original:r,name:t}))}};SourceMapGenerator.prototype._serializeMappings=function SourceMapGenerator_serializeMappings(){var e=0;var r=1;var n=0;var i=0;var a=0;var u=0;var s="";var l;var c;var p;var f;var g=this._mappings.toArray();for(var h=0,d=g.length;h0){if(!o.compareByGeneratedPositionsInflated(c,g[h-1])){continue}l+=","}}l+=t.encode(c.generatedColumn-e);e=c.generatedColumn;if(c.source!=null){f=this._sources.indexOf(c.source);l+=t.encode(f-u);u=f;l+=t.encode(c.originalLine-1-i);i=c.originalLine-1;l+=t.encode(c.originalColumn-n);n=c.originalColumn;if(c.name!=null){p=this._names.indexOf(c.name);l+=t.encode(p-a);a=p}}s+=l}return s};SourceMapGenerator.prototype._generateSourcesContent=function SourceMapGenerator_generateSourcesContent(e,r){return e.map((function(e){if(!this._sourcesContents){return null}if(r!=null){e=o.relative(r,e)}var n=o.toSetString(e);return Object.prototype.hasOwnProperty.call(this._sourcesContents,n)?this._sourcesContents[n]:null}),this)};SourceMapGenerator.prototype.toJSON=function SourceMapGenerator_toJSON(){var e={version:this._version,sources:this._sources.toArray(),names:this._names.toArray(),mappings:this._serializeMappings()};if(this._file!=null){e.file=this._file}if(this._sourceRoot!=null){e.sourceRoot=this._sourceRoot}if(this._sourcesContents){e.sourcesContent=this._generateSourcesContent(e.sources,e.sourceRoot)}return e};SourceMapGenerator.prototype.toString=function SourceMapGenerator_toString(){return 
JSON.stringify(this.toJSON())};r.h=SourceMapGenerator},351:(e,r,n)=>{var t;var o=n(591).h;var i=n(339);var a=/(\r?\n)/;var u=10;var s="$$$isSourceNode$$$";function SourceNode(e,r,n,t,o){this.children=[];this.sourceContents={};this.line=e==null?null:e;this.column=r==null?null:r;this.source=n==null?null:n;this.name=o==null?null:o;this[s]=true;if(t!=null)this.add(t)}SourceNode.fromStringWithSourceMap=function SourceNode_fromStringWithSourceMap(e,r,n){var t=new SourceNode;var o=e.split(a);var u=0;var shiftNextLine=function(){var e=getNextLine();var r=getNextLine()||"";return e+r;function getNextLine(){return u=0;r--){this.prepend(e[r])}}else if(e[s]||typeof e==="string"){this.children.unshift(e)}else{throw new TypeError("Expected a SourceNode, string, or an array of SourceNodes and strings. Got "+e)}return this};SourceNode.prototype.walk=function SourceNode_walk(e){var r;for(var n=0,t=this.children.length;n0){r=[];for(n=0;n{function getArg(e,r,n){if(r in e){return e[r]}else if(arguments.length===3){return n}else{throw new Error('"'+r+'" is a required argument.')}}r.getArg=getArg;var n=/^(?:([\w+\-.]+):)?\/\/(?:(\w+:\w+)@)?([\w.-]*)(?::(\d+))?(.*)$/;var t=/^data:.+\,.+$/;function urlParse(e){var r=e.match(n);if(!r){return null}return{scheme:r[1],auth:r[2],host:r[3],port:r[4],path:r[5]}}r.urlParse=urlParse;function urlGenerate(e){var r="";if(e.scheme){r+=e.scheme+":"}r+="//";if(e.auth){r+=e.auth+"@"}if(e.host){r+=e.host}if(e.port){r+=":"+e.port}if(e.path){r+=e.path}return r}r.urlGenerate=urlGenerate;function normalize(e){var n=e;var t=urlParse(e);if(t){if(!t.path){return e}n=t.path}var o=r.isAbsolute(n);var i=n.split(/\/+/);for(var a,u=0,s=i.length-1;s>=0;s--){a=i[s];if(a==="."){i.splice(s,1)}else if(a===".."){u++}else if(u>0){if(a===""){i.splice(s+1,u);u=0}else{i.splice(s,2);u--}}}n=i.join("/");if(n===""){n=o?"/":"."}if(t){t.path=n;return urlGenerate(t)}return n}r.normalize=normalize;function join(e,r){if(e===""){e="."}if(r===""){r="."}var n=urlParse(r);var o=urlParse(e);if(o){e=o.path||"/"}if(n&&!n.scheme){if(o){n.scheme=o.scheme}return urlGenerate(n)}if(n||r.match(t)){return r}if(o&&!o.host&&!o.path){o.host=r;return urlGenerate(o)}var i=r.charAt(0)==="/"?r:normalize(e.replace(/\/+$/,"")+"/"+r);if(o){o.path=i;return urlGenerate(o)}return i}r.join=join;r.isAbsolute=function(e){return e.charAt(0)==="/"||n.test(e)};function relative(e,r){if(e===""){e="."}e=e.replace(/\/$/,"");var n=0;while(r.indexOf(e+"/")!==0){var t=e.lastIndexOf("/");if(t<0){return r}e=e.slice(0,t);if(e.match(/^([^\/]+:\/)?\/*$/)){return r}++n}return Array(n+1).join("../")+r.substr(e.length+1)}r.relative=relative;var o=function(){var e=Object.create(null);return!("__proto__"in e)}();function identity(e){return e}function toSetString(e){if(isProtoString(e)){return"$"+e}return e}r.toSetString=o?identity:toSetString;function fromSetString(e){if(isProtoString(e)){return e.slice(1)}return e}r.fromSetString=o?identity:fromSetString;function isProtoString(e){if(!e){return false}var r=e.length;if(r<9){return false}if(e.charCodeAt(r-1)!==95||e.charCodeAt(r-2)!==95||e.charCodeAt(r-3)!==111||e.charCodeAt(r-4)!==116||e.charCodeAt(r-5)!==111||e.charCodeAt(r-6)!==114||e.charCodeAt(r-7)!==112||e.charCodeAt(r-8)!==95||e.charCodeAt(r-9)!==95){return false}for(var n=r-10;n>=0;n--){if(e.charCodeAt(n)!==36){return false}}return true}function compareByOriginalPositions(e,r,n){var t=strcmp(e.source,r.source);if(t!==0){return t}t=e.originalLine-r.originalLine;if(t!==0){return t}t=e.originalColumn-r.originalColumn;if(t!==0||n){return 
t}t=e.generatedColumn-r.generatedColumn;if(t!==0){return t}t=e.generatedLine-r.generatedLine;if(t!==0){return t}return strcmp(e.name,r.name)}r.compareByOriginalPositions=compareByOriginalPositions;function compareByGeneratedPositionsDeflated(e,r,n){var t=e.generatedLine-r.generatedLine;if(t!==0){return t}t=e.generatedColumn-r.generatedColumn;if(t!==0||n){return t}t=strcmp(e.source,r.source);if(t!==0){return t}t=e.originalLine-r.originalLine;if(t!==0){return t}t=e.originalColumn-r.originalColumn;if(t!==0){return t}return strcmp(e.name,r.name)}r.compareByGeneratedPositionsDeflated=compareByGeneratedPositionsDeflated;function strcmp(e,r){if(e===r){return 0}if(e===null){return 1}if(r===null){return-1}if(e>r){return 1}return-1}function compareByGeneratedPositionsInflated(e,r){var n=e.generatedLine-r.generatedLine;if(n!==0){return n}n=e.generatedColumn-r.generatedColumn;if(n!==0){return n}n=strcmp(e.source,r.source);if(n!==0){return n}n=e.originalLine-r.originalLine;if(n!==0){return n}n=e.originalColumn-r.originalColumn;if(n!==0){return n}return strcmp(e.name,r.name)}r.compareByGeneratedPositionsInflated=compareByGeneratedPositionsInflated;function parseSourceMapInput(e){return JSON.parse(e.replace(/^\)]}'[^\n]*\n/,""))}r.parseSourceMapInput=parseSourceMapInput;function computeSourceURL(e,r,n){r=r||"";if(e){if(e[e.length-1]!=="/"&&r[0]!=="/"){e+="/"}r=e+r}if(n){var t=urlParse(n);if(!t){throw new Error("sourceMapURL could not be parsed")}if(t.path){var o=t.path.lastIndexOf("/");if(o>=0){t.path=t.path.substring(0,o+1)}}r=join(urlGenerate(t),r)}return normalize(r)}r.computeSourceURL=computeSourceURL},997:(e,r,n)=>{n(591).h;r.SourceMapConsumer=n(952).SourceMapConsumer;n(351)},284:(e,r,n)=>{e=n.nmd(e);var t=n(997).SourceMapConsumer;var o=n(17);var i;try{i=n(147);if(!i.existsSync||!i.readFileSync){i=null}}catch(e){}var a=n(650);function dynamicRequire(e,r){return e.require(r)}var u=false;var s=false;var l=false;var c="auto";var p={};var f={};var g=/^data:application\/json[^,]+base64,/;var h=[];var d=[];function isInBrowser(){if(c==="browser")return true;if(c==="node")return false;return typeof window!=="undefined"&&typeof XMLHttpRequest==="function"&&!(window.require&&window.module&&window.process&&window.process.type==="renderer")}function hasGlobalProcessEventEmitter(){return typeof process==="object"&&process!==null&&typeof process.on==="function"}function globalProcessVersion(){if(typeof process==="object"&&process!==null){return process.version}else{return""}}function globalProcessStderr(){if(typeof process==="object"&&process!==null){return process.stderr}}function globalProcessExit(e){if(typeof process==="object"&&process!==null&&typeof process.exit==="function"){return process.exit(e)}}function handlerExec(e){return function(r){for(var n=0;n"}var n=this.getLineNumber();if(n!=null){r+=":"+n;var t=this.getColumnNumber();if(t){r+=":"+t}}}var o="";var i=this.getFunctionName();var a=true;var u=this.isConstructor();var s=!(this.isToplevel()||u);if(s){var l=this.getTypeName();if(l==="[object Object]"){l="null"}var c=this.getMethodName();if(i){if(l&&i.indexOf(l)!=0){o+=l+"."}o+=i;if(c&&i.indexOf("."+c)!=i.length-c.length-1){o+=" [as "+c+"]"}}else{o+=l+"."+(c||"")}}else if(u){o+="new "+(i||"")}else if(i){o+=i}else{o+=r;a=false}if(a){o+=" ("+r+")"}return o}function cloneCallSite(e){var r={};Object.getOwnPropertyNames(Object.getPrototypeOf(e)).forEach((function(n){r[n]=/^(?:is|get)/.test(n)?function(){return e[n].call(e)}:e[n]}));r.toString=CallSiteToString;return r}function 
wrapCallSite(e,r){if(r===undefined){r={nextPosition:null,curPosition:null}}if(e.isNative()){r.curPosition=null;return e}var n=e.getFileName()||e.getScriptNameOrSourceURL();if(n){var t=e.getLineNumber();var o=e.getColumnNumber()-1;var i=/^v(10\.1[6-9]|10\.[2-9][0-9]|10\.[0-9]{3,}|1[2-9]\d*|[2-9]\d|\d{3,}|11\.11)/;var a=i.test(globalProcessVersion())?0:62;if(t===1&&o>a&&!isInBrowser()&&!e.isEval()){o-=a}var u=mapSourcePosition({source:n,line:t,column:o});r.curPosition=u;e=cloneCallSite(e);var s=e.getFunctionName;e.getFunctionName=function(){if(r.nextPosition==null){return s()}return r.nextPosition.name||s()};e.getFileName=function(){return u.source};e.getLineNumber=function(){return u.line};e.getColumnNumber=function(){return u.column+1};e.getScriptNameOrSourceURL=function(){return u.source};return e}var l=e.isEval()&&e.getEvalOrigin();if(l){l=mapEvalOrigin(l);e=cloneCallSite(e);e.getEvalOrigin=function(){return l};return e}return e}function prepareStackTrace(e,r){if(l){p={};f={}}var n=e.name||"Error";var t=e.message||"";var o=n+": "+t;var i={nextPosition:null,curPosition:null};var a=[];for(var u=r.length-1;u>=0;u--){a.push("\n at "+wrapCallSite(r[u],i));i.nextPosition=i.curPosition}i.curPosition=i.nextPosition=null;return o+a.reverse().join("")}function getErrorSource(e){var r=/\n at [^(]+ \((.*):(\d+):(\d+)\)/.exec(e.stack);if(r){var n=r[1];var t=+r[2];var o=+r[3];var a=p[n];if(!a&&i&&i.existsSync(n)){try{a=i.readFileSync(n,"utf8")}catch(e){a=""}}if(a){var u=a.split(/(?:\r\n|\r|\n)/)[t-1];if(u){return n+":"+t+"\n"+u+"\n"+new Array(o).join(" ")+"^"}}}return null}function printErrorAndExit(e){var r=getErrorSource(e);var n=globalProcessStderr();if(n&&n._handle&&n._handle.setBlocking){n._handle.setBlocking(true)}if(r){console.error();console.error(r)}console.error(e.stack);globalProcessExit(1)}function shimEmitUncaughtException(){var e=process.emit;process.emit=function(r){if(r==="uncaughtException"){var n=arguments[1]&&arguments[1].stack;var t=this.listeners(r).length>0;if(n&&!t){return printErrorAndExit(arguments[1])}}return e.apply(this,arguments)}}var S=h.slice(0);var _=d.slice(0);r.wrapCallSite=wrapCallSite;r.getErrorSource=getErrorSource;r.mapSourcePosition=mapSourcePosition;r.retrieveSourceMap=v;r.install=function(r){r=r||{};if(r.environment){c=r.environment;if(["node","browser","auto"].indexOf(c)===-1){throw new Error("environment "+c+" was unknown. 
Available options are {auto, browser, node}")}}if(r.retrieveFile){if(r.overrideRetrieveFile){h.length=0}h.unshift(r.retrieveFile)}if(r.retrieveSourceMap){if(r.overrideRetrieveSourceMap){d.length=0}d.unshift(r.retrieveSourceMap)}if(r.hookRequire&&!isInBrowser()){var n=dynamicRequire(e,"module");var t=n.prototype._compile;if(!t.__sourceMapSupport){n.prototype._compile=function(e,r){p[r]=e;f[r]=undefined;return t.call(this,e,r)};n.prototype._compile.__sourceMapSupport=true}}if(!l){l="emptyCacheBetweenOperations"in r?r.emptyCacheBetweenOperations:false}if(!u){u=true;Error.prepareStackTrace=prepareStackTrace}if(!s){var o="handleUncaughtExceptions"in r?r.handleUncaughtExceptions:true;try{var i=dynamicRequire(e,"worker_threads");if(i.isMainThread===false){o=false}}catch(e){}if(o&&hasGlobalProcessEventEmitter()){s=true;shimEmitUncaughtException()}}};r.resetRetrieveHandlers=function(){h.length=0;d.length=0;h=S.slice(0);d=_.slice(0);v=handlerExec(d);m=handlerExec(h)}},147:e=>{"use strict";e.exports=require("fs")},17:e=>{"use strict";e.exports=require("path")}};var r={};function __webpack_require__(n){var t=r[n];if(t!==undefined){return t.exports}var o=r[n]={id:n,loaded:false,exports:{}};var i=true;try{e[n](o,o.exports,__webpack_require__);i=false}finally{if(i)delete r[n]}o.loaded=true;return o.exports}(()=>{__webpack_require__.nmd=e=>{e.paths=[];if(!e.children)e.children=[];return e}})();if(typeof __webpack_require__!=="undefined")__webpack_require__.ab=__dirname+"/";var n={};(()=>{__webpack_require__(284).install()})();module.exports=n})(); -------------------------------------------------------------------------------- /cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algolia/algoliasearch-crawler-github-actions/81a0bd9e7956d411f3321d7d5af970117403a8b3/cover.jpg -------------------------------------------------------------------------------- /examples/basic.yml: -------------------------------------------------------------------------------- 1 | name: Algolia Recrawl Example 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | types: ['opened', 'edited', 'reopened', 'synchronize'] 8 | 9 | jobs: 10 | algolia_crawl: 11 | name: Algolia Recrawl 12 | runs-on: ubuntu-latest 13 | steps: 14 | # checkout this repo 15 | - name: Checkout Repo 16 | uses: actions/checkout@v2 17 | 18 | - name: Algolia crawler creation and crawl 19 | uses: algolia/algoliasearch-crawler-github-actions@v1 20 | id: algolia_crawler 21 | with: # mandatory parameters 22 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 23 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 24 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 25 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 26 | site-url: 'https://crawler.algolia.com/test-website/' 27 | -------------------------------------------------------------------------------- /examples/github_pages.yml: -------------------------------------------------------------------------------- 1 | name: Github Pages -> Algolia Crawler 2 | on: 3 | push: 4 | branches: [ main ] 5 | 6 | jobs: 7 | algolia_recrawl: 8 | name: Algolia Recrawl 9 | runs-on: ubuntu-latest 10 | steps: 11 | # checkout this repo 12 | - name: Checkout Repo 13 | uses: actions/checkout@v2 14 | 15 | # We don't know when the site will be deployed, we just wait a few seconds 16 | # Better solutions can be found 17 | - name: Sleep for 30s 18 | run: sleep 30 19 | 20 | - name: Github-pages-MAIN => Algolia crawler creation and recrawl (Push on Main branch) 
21 | uses: algolia/algoliasearch-crawler-github-actions@v1 22 | id: crawler_push 23 | with: 24 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 25 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 26 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 27 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 28 | site-url: 'https://community.algolia.com/algoliasearch-crawler-github-actions/public/github-pages/' 29 | override-config: true 30 | -------------------------------------------------------------------------------- /examples/netlify.yml: -------------------------------------------------------------------------------- 1 | name: Netlify -> Algolia Crawler 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | types: ['opened', 'edited', 'reopened', 'synchronize'] 7 | 8 | jobs: 9 | algolia_recrawl: 10 | name: Algolia Recrawl 11 | runs-on: ubuntu-latest 12 | steps: 13 | # checkout this repo 14 | - name: Checkout Repo 15 | uses: actions/checkout@v2 16 | 17 | # We don't know when the site will be deployed, we just wait a few seconds 18 | # Better solutions can be found 19 | - name: Sleep for 30s 20 | run: sleep 30 21 | 22 | # For PRs 23 | - name: Netlify-PR => Algolia crawler creation and recrawl on preview (Pull Request) 24 | if: github.ref != 'refs/heads/main' 25 | uses: algolia/algoliasearch-crawler-github-actions@v1 26 | id: crawler_pr 27 | with: 28 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 29 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 30 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 31 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 32 | site-url: 'https://deploy-preview-${{ github.event.pull_request.number }}--algolia-ga-actions-netlify.netlify.app/' 33 | override-config: true 34 | 35 | # For main branch 36 | - name: Netlify-MAIN => Algolia crawler creation and recrawl (Push on Main branch) 37 | if: github.ref == 'refs/heads/main' 38 | uses: algolia/algoliasearch-crawler-github-actions@v1 39 | id: crawler_push 40 | with: 41 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 42 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 43 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 44 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 45 | site-url: 'https://algolia-ga-actions-netlify.netlify.app/' 46 | override-config: true 47 | 48 | -------------------------------------------------------------------------------- /examples/vercel_pr.yml: -------------------------------------------------------------------------------- 1 | name: Vercel -> Algolia Crawler (PR) 2 | on: 3 | pull_request: 4 | types: ['opened', 'edited', 'reopened', 'synchronize'] 5 | 6 | jobs: 7 | algolia_recrawl: 8 | name: Algolia Recrawl 9 | runs-on: ubuntu-latest 10 | steps: 11 | # checkout this repo 12 | - name: Checkout Repo 13 | uses: actions/checkout@v2 14 | 15 | # Get url directly from vercel 16 | - name: Get deployment URL 17 | id: deployment 18 | uses: dorshinar/get-deployment-url@master 19 | timeout-minutes: 1 20 | with: 21 | token: ${{ github.token }} 22 | 23 | - name: Vercel-PR => Algolia crawler creation and recrawl on preview (Pull Request) 24 | uses: algolia/algoliasearch-crawler-github-actions@v1 25 | id: crawler_pr 26 | with: 27 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 28 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 29 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 30 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 31 | site-url: ${{ steps.deployment.outputs.deployment }} 32 | override-config: true 33 | 
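The GitHub Pages and Netlify examples above wait a fixed 30 seconds ("Sleep for 30s") before triggering the crawl, and their comments acknowledge that better solutions can be found. A minimal sketch of one alternative is to poll the deployed URL until it answers, assuming the site returns an HTTP success status once it is live; the URL, attempt count, and step name below are placeholders rather than values taken from these examples:

      # Hypothetical replacement for the fixed "Sleep for 30s" step:
      # poll the deployed site until it responds, then let the crawl step run.
      - name: Wait for deployment to respond
        run: |
          for i in $(seq 1 30); do
            if curl --silent --fail --output /dev/null 'https://example.com/'; then
              echo "Site responded after $i attempt(s)"
              exit 0
            fi
            sleep 5
          done
          echo "Site did not respond in time" >&2
          exit 1

With a polling step like this, the crawler step runs as soon as the site answers instead of always paying the full fixed wait.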
-------------------------------------------------------------------------------- /examples/vercel_push.yml: -------------------------------------------------------------------------------- 1 | name: Vercel -> Algolia Crawler (push on main) 2 | on: 3 | push: 4 | branches: [ main ] 5 | 6 | jobs: 7 | algolia_recrawl: 8 | name: Algolia Recrawl 9 | runs-on: ubuntu-latest 10 | steps: 11 | # checkout this repo 12 | - name: Checkout Repo 13 | uses: actions/checkout@v2 14 | 15 | - name: Vercel-MAIN => Algolia crawler creation and recrawl on preview (Push on Main branch) 16 | uses: algolia/algoliasearch-crawler-github-actions@v1 17 | id: crawler_push 18 | with: 19 | crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} 20 | crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} 21 | algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} 22 | algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} 23 | site-url: 'https://crawler.algolia.com/test-website/' 24 | override-config: true 25 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: 'ts-jest/presets/default-esm', 3 | testEnvironment: 'node', 4 | testMatch: ['/src/**/*.test.ts'], 5 | testPathIgnorePatterns: ['/node_modules/', '/dist/'], 6 | maxWorkers: 1, 7 | globals: { 8 | 'ts-jest': { 9 | useESM: true, 10 | }, 11 | }, 12 | }; 13 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "private": true, 3 | "name": "algoliasearch-crawler-github-actions", 4 | "version": "1.1.13", 5 | "description": "Github action for Algolia crawler indexing", 6 | "exports": "./build/index.js", 7 | "scripts": { 8 | "build": "yarn clean && tsc", 9 | "clean": "rm -rf dist/ build/", 10 | "test": "jest", 11 | "lint": "eslint --ext=jsx,ts,tsx,js src/", 12 | "semantic-release": "semantic-release", 13 | "compile": "ncc build ./dist/index.js -o ./build -m -s", 14 | "hot:runtime": "tsc -b -w --preserveWatchOutput" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/algolia/algoliasearch-crawler-github-actions.git" 19 | }, 20 | "keywords": [ 21 | "Crawler", 22 | "Algolia", 23 | "Github", 24 | "Action", 25 | "indexing", 26 | "crawl", 27 | "algoliasearch", 28 | "search" 29 | ], 30 | "author": { 31 | "name": "Algolia, Inc.", 32 | "url": "https://www.algolia.com" 33 | }, 34 | "license": "MIT", 35 | "bugs": { 36 | "url": "https://github.com/algolia/algoliasearch-crawler-github-actions/issues" 37 | }, 38 | "engines": { 39 | "node": "^16.14.2" 40 | }, 41 | "homepage": "https://github.com/algolia/algoliasearch-crawler-github-actions#readme", 42 | "devDependencies": { 43 | "@semantic-release/changelog": "6.0.2", 44 | "@semantic-release/exec": "6.0.3", 45 | "@semantic-release/git": "10.0.1", 46 | "@semantic-release/npm": "9.0.1", 47 | "@types/cheerio": "0.22.35", 48 | "@types/jest": "27.5.2", 49 | "@types/node": "16.18.16", 50 | "@types/node-fetch": "2.6.2", 51 | "@typescript-eslint/eslint-plugin": "5.44.0", 52 | "@typescript-eslint/parser": "5.44.0", 53 | "@vercel/ncc": "0.38.3", 54 | "eslint": "8.57.1", 55 | "eslint-config-algolia": "20.1.0", 56 | "eslint-config-prettier": "8.10.0", 57 | "eslint-plugin-eslint-comments": "3.2.0", 58 | "eslint-plugin-import": "2.26.0", 59 | "eslint-plugin-jest": "26.9.0", 60 | "eslint-plugin-jsdoc": "39.9.1", 61 | "eslint-plugin-prettier": "4.2.1", 62 | 
"jest": "27.5.1", 63 | "prettier": "2.8.2", 64 | "semantic-release": "19.0.5", 65 | "ts-jest": "27.1.5", 66 | "ts-node": "10.9.2", 67 | "typescript": "4.9.5" 68 | }, 69 | "dependencies": { 70 | "@actions/core": "1.11.1", 71 | "@actions/github": "5.1.1", 72 | "node-fetch": "2.6.7" 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /public/github-pages/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | First test page 5 | 6 | 7 | 8 |

First test page
9 | This is the contents of the first test page.
10 | 11 | 12 | -------------------------------------------------------------------------------- /public/github-pages/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Second test page 5 | 6 | 7 | 8 |

Second test page
9 | This is the contents of the second test page.
10 | 11 | 12 | -------------------------------------------------------------------------------- /public/github-pages/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Algoliasearch Github Pages Test Website 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Algoliasearch Github Actions Test Website
13 | 14 | Test content
15 | 16 | Some content to index.
17 | Links to other pages: 18 |
26 |
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /public/netlify/1.html: -------------------------------------------------------------------------------- 1 | lines (11 sloc) 259 Bytes 2 | 3 | 4 | 5 | First test page 6 | 7 | 8 | 9 |

First test page
10 | This is the contents of the first test page
11 | 12 | 13 | -------------------------------------------------------------------------------- /public/netlify/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Second test page 5 | 6 | 7 | 8 |

Second test page
9 | This is the contents of the second test page.
10 | 11 | 12 | -------------------------------------------------------------------------------- /public/netlify/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Algoliasearch Netlify Test Website 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Algoliasearch Netlify Test Website
13 | 14 | Test content
15 | 16 | Some content to index.
17 | Links to other pages: 18 |
26 |
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /public/vercel/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | First test page 5 | 6 | 7 | 8 |

First test page
9 | This is the contents of the first test page.
10 | 11 | 12 | -------------------------------------------------------------------------------- /public/vercel/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Second test page 5 | 6 | 7 | 8 |

Second test page
9 | This is the contents of the second test page.
10 | 11 | 12 | -------------------------------------------------------------------------------- /public/vercel/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Algoliasearch Vercel Test Website 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Algoliasearch Vercel Test Website
13 | 14 | Test content
15 | 16 | Some content to index.
17 | Links to other pages: 18 |
26 |
27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /release.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-template-curly-in-string */ 2 | module.exports = { 3 | branches: 'main', 4 | verifyConditions: ['@semantic-release/github'], 5 | prepare: [ 6 | { 7 | path: '@semantic-release/changelog', 8 | changelogFile: 'CHANGELOG.md', 9 | }, 10 | '@semantic-release/npm', 11 | { 12 | path: '@semantic-release/exec', 13 | prepareCmd: 'yarn compile', 14 | }, 15 | { 16 | path: '@semantic-release/git', 17 | assets: [ 18 | 'package.json', 19 | 'CHANGELOG.md', 20 | 'build/index.js', 21 | 'build/package.json', 22 | ], 23 | message: 24 | 'chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}', 25 | }, 26 | ], 27 | publish: ['@semantic-release/github'], 28 | success: [], 29 | fail: [], 30 | npmPublish: false, 31 | }; 32 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:js-app", 4 | "algolia", 5 | ":prHourlyLimitNone" 6 | ], 7 | "dependencyDashboard": true, 8 | "semanticCommitType": "chore" 9 | } 10 | -------------------------------------------------------------------------------- /src/crawler-api-client.ts: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch'; 2 | import type { Response } from 'node-fetch'; 3 | 4 | // eslint-disable-next-line @typescript-eslint/prefer-ts-expect-error 5 | // @ts-ignore 6 | import { version } from '../package.json'; 7 | 8 | import type { ConfigJson } from './types/configJson'; 9 | import type { 10 | GetCrawlersResponseBody, 11 | CreatedCrawlerResponseBody, 12 | UpdateConfigResponseBody, 13 | CrawlerStatusResponseBody, 14 | GetUrlStatsResponseBody, 15 | TaskResponseBody, 16 | UrlTestResponseBody, 17 | } from './types/publicApiJsonResponses'; 18 | 19 | type SearchParams = { [key: string]: boolean | number | string }; 20 | 21 | export interface ClientParams { 22 | crawlerUserId: string; 23 | crawlerApiBaseUrl: string; 24 | crawlerApiKey: string; 25 | } 26 | 27 | export interface CrawlerParams { 28 | id: string; 29 | name: string; 30 | jsonConfig: ConfigJson; 31 | } 32 | 33 | export interface ActionParams { 34 | crawlerId: string; 35 | actionName: string; 36 | } 37 | 38 | export interface TaskParams { 39 | crawlerId: string; 40 | taskId: string; 41 | } 42 | 43 | export interface TestUrlParams { 44 | crawlerId: string; 45 | url: string; 46 | config?: JSON; 47 | } 48 | 49 | const USER_AGENT = `algolia_crawler_github_actions/${version}`; 50 | 51 | /** 52 | * Example of class that can be used to hit the Crawler API. 53 | * 54 | * @example 55 | * const client = new CrawlerApiClient({ 56 | * crawlerApiBaseUrl: 'https://crawler.algolia.com/api/1/', 57 | * crawlerUserId: 'test_user@algolia.com', 58 | * crawlerApiKey: 'crawler_api_key' 59 | * }); 60 | * await client.reindex('crawler_id'); 61 | */ 62 | export class CrawlerApiClient { 63 | crawlerUserId: string; 64 | crawlerApiKey: string; 65 | crawlerApiBaseUrl: string; 66 | 67 | constructor({ 68 | crawlerUserId, 69 | crawlerApiBaseUrl, 70 | crawlerApiKey, 71 | }: ClientParams) { 72 | this.crawlerUserId = crawlerUserId; 73 | this.crawlerApiKey = crawlerApiKey; 74 | this.crawlerApiBaseUrl = crawlerApiBaseUrl; 75 | } 76 | 77 | /** 78 | * Get Basic Auth token, base64 encoded. 
79 | * 80 | * @returns - Basic Auth Token. 81 | */ 82 | get basicAuthToken(): string { 83 | return `Basic ${Buffer.from( 84 | `${this.crawlerUserId}:${this.crawlerApiKey}` 85 | ).toString('base64')}`; 86 | } 87 | 88 | static async __handleResponse(res: Response): Promise { 89 | if (res.ok) { 90 | try { 91 | return (await res.json()) as TBody; 92 | } catch (err) { 93 | // eslint-disable-next-line no-console 94 | console.log('Body', await res.text()); 95 | throw new Error('Cant decode success body'); 96 | } 97 | } 98 | 99 | const body = await res.text(); 100 | throw new Error(`${res.status}: ${res.statusText}\n${body}`); 101 | } 102 | 103 | /** 104 | * Create a new Crawler. 105 | * 106 | * @param name - The crawler's name. 107 | * @param jsonConfig - The crawler configuration, in JSON format. 108 | * @returns A promise that will resolve with an object containing the crawler's id: `{ id: 'crawler_id' }`. 109 | */ 110 | async createCrawler( 111 | name: string, 112 | jsonConfig: ConfigJson 113 | ): Promise { 114 | const body = { 115 | name, 116 | config: jsonConfig, 117 | }; 118 | const res = await fetch(`${this.crawlerApiBaseUrl}/crawlers`, { 119 | method: 'POST', 120 | headers: { 121 | Authorization: this.basicAuthToken, 122 | 'Content-Type': 'application/json', 123 | 'User-Agent': USER_AGENT, 124 | }, 125 | body: JSON.stringify(body), 126 | }); 127 | 128 | return CrawlerApiClient.__handleResponse(res); 129 | } 130 | 131 | /** 132 | * Update a Crawler. 133 | * 134 | * @param p - Params. 135 | * @param p.id - Identifier of the crawler to update. 136 | * @param p.name - (optional) The new name of the crawler. 137 | * @param p.jsonConfig - (optional) The new configuration of the crawler. It must be a complete config as it 138 | * will completely override the existing one. 139 | * @returns A promise that will resolve with an object containing a taskId: `{ taskId: 'task_id' }`. 140 | */ 141 | async updateCrawler({ 142 | id, 143 | name, 144 | jsonConfig, 145 | }: CrawlerParams): Promise { 146 | const body = { 147 | name, 148 | config: jsonConfig, 149 | }; 150 | const res = await fetch(`${this.crawlerApiBaseUrl}/crawlers/${id}`, { 151 | method: 'PATCH', 152 | headers: { 153 | Authorization: this.basicAuthToken, 154 | 'Content-Type': 'application/json', 155 | 'User-Agent': USER_AGENT, 156 | }, 157 | body: JSON.stringify(body), 158 | }); 159 | return CrawlerApiClient.__handleResponse(res); 160 | } 161 | 162 | /** 163 | * List all Crawlers. 164 | * 165 | * @param p - Params. 166 | * @param p.itemsPerPage - The number of crawlers to return per page. 167 | * @param p.page - The page to fetch. 168 | * @param p.name - Name of the crawler to get. 169 | * @param p.appId - Application of the crawlers to get. 170 | * @returns A promise that will resolve with an object looking like: 171 | * { 172 | * items: [{ id: 'crawler_1_id', name: 'crawler_1_name' }, { id: 'crawler_2_id, ... }], 173 | * itemsPerPage: 20, 174 | * page: 1, 175 | * total: 5 176 | * } 177 | * . 
178 | */ 179 | async getCrawlers({ 180 | itemsPerPage, 181 | page, 182 | name, 183 | appId, 184 | }: { 185 | itemsPerPage?: number; 186 | page?: number; 187 | name?: string; 188 | appId?: string; 189 | }): Promise { 190 | const searchParams: SearchParams = {}; 191 | if (itemsPerPage) searchParams.itemsPerPage = itemsPerPage; 192 | if (page) searchParams.page = page; 193 | if (name) searchParams.name = name; 194 | if (appId) searchParams.appId = appId; 195 | const qs = Object.keys(searchParams) 196 | .map( 197 | (k) => `${encodeURIComponent(k)}=${encodeURIComponent(searchParams[k])}` 198 | ) 199 | .join('&'); 200 | const res = await fetch( 201 | `${this.crawlerApiBaseUrl}/crawlers${qs ? `?${qs}` : ''}`, 202 | { 203 | headers: { 204 | Authorization: this.basicAuthToken, 205 | 'User-Agent': USER_AGENT, 206 | }, 207 | } 208 | ); 209 | return CrawlerApiClient.__handleResponse(res); 210 | } 211 | 212 | /** 213 | * Update a Crawler's configuration. 214 | * 215 | * @param id - Identifier of the crawler configuration to update. 216 | * @param partialJsonConfig - The config object that will be merged with the current configuration. 217 | * @example 218 | * The merge will be done on top-level properties: 219 | * const newConfig = { 220 | * ...currentConfigInDB, 221 | * ...partialJsonConfig, 222 | * } 223 | * @returns A promise that will resolve with an object containing a taskId: `{ taskId: 'task_id' }`. 224 | */ 225 | async updateConfig( 226 | id: string, 227 | partialJsonConfig: ConfigJson 228 | ): Promise { 229 | const res = await fetch(`${this.crawlerApiBaseUrl}/crawlers/${id}/config`, { 230 | method: 'PATCH', 231 | headers: { 232 | Authorization: this.basicAuthToken, 233 | 'Content-Type': 'application/json', 234 | 'User-Agent': USER_AGENT, 235 | }, 236 | body: JSON.stringify(partialJsonConfig), 237 | }); 238 | return CrawlerApiClient.__handleResponse(res); 239 | } 240 | 241 | /** 242 | * Get the crawler's configuration. 243 | * 244 | * @param id - Identifier of the Crawler. 245 | * @returns A promise that will resolve with the crawler's config (in JSON format). 246 | */ 247 | async getConfig(id: string): Promise { 248 | const res = await fetch( 249 | `${this.crawlerApiBaseUrl}/crawlers/${id}?withConfig=true`, 250 | { 251 | headers: { 252 | Authorization: this.basicAuthToken, 253 | }, 254 | } 255 | ); 256 | /* const { config } = await CrawlerApiClient.__handleResponse(res); 257 | return config; */ 258 | return CrawlerApiClient.__handleResponse(res); 259 | } 260 | 261 | /** 262 | * Get the status of a crawler. 263 | * 264 | * @param id - The id of the crawler. 265 | * @returns A promise that will resolve with an object containing the status of the crawler. 266 | */ 267 | async getStatus(id: string): Promise { 268 | const res = await fetch(`${this.crawlerApiBaseUrl}/crawlers/${id}`, { 269 | headers: { 270 | Authorization: this.basicAuthToken, 271 | 'User-Agent': USER_AGENT, 272 | }, 273 | }); 274 | return CrawlerApiClient.__handleResponse(res); 275 | } 276 | 277 | /** 278 | * Get statistics of the last reindex a crawler. 279 | * 280 | * @param id - The id of the crawler. 281 | * @returns A promise that will resolve with an object containing some statistics about the last reindex. 
282 | */ 283 | async getURLStats(id: string): Promise { 284 | const res = await fetch( 285 | `${this.crawlerApiBaseUrl}/crawlers/${id}/stats/urls`, 286 | { 287 | headers: { 288 | Authorization: this.basicAuthToken, 289 | 'User-Agent': USER_AGENT, 290 | }, 291 | } 292 | ); 293 | return CrawlerApiClient.__handleResponse(res); 294 | } 295 | 296 | /** 297 | * Trigger a reindex on a crawler. 298 | * 299 | * @param id - Identifier of the Crawler. 300 | * @returns A promise that will resolve with an object containing a `taskId`. 301 | */ 302 | async reindex(id: string): Promise { 303 | return await this.__triggerAction({ crawlerId: id, actionName: 'reindex' }); 304 | } 305 | /** 306 | * Trigger a run on a crawler. 307 | * 308 | * @param id - Identifier of the Crawler. 309 | * @returns A promise that will resolve with an object containing a `taskId`. 310 | */ 311 | async run(id: string): Promise { 312 | return await this.__triggerAction({ crawlerId: id, actionName: 'run' }); 313 | } 314 | /** 315 | * Trigger a pause on a crawler. 316 | * 317 | * @param id - Identifier of the Crawler. 318 | * @returns A promise that will resolve with an object containing a `taskId`. 319 | */ 320 | async pause(id: string): Promise { 321 | return await this.__triggerAction({ crawlerId: id, actionName: 'pause' }); 322 | } 323 | 324 | async __triggerAction({ 325 | crawlerId, 326 | actionName, 327 | }: ActionParams): Promise { 328 | const res = await fetch( 329 | `${this.crawlerApiBaseUrl}/crawlers/${crawlerId}/${actionName}`, 330 | { 331 | method: 'POST', 332 | headers: { 333 | Authorization: this.basicAuthToken, 334 | 'Content-Type': 'application/json', 335 | 'User-Agent': USER_AGENT, 336 | }, 337 | } 338 | ); 339 | return CrawlerApiClient.__handleResponse(res); 340 | } 341 | 342 | /** 343 | * Wait for a task to complete. This method will poll the specified crawler every second 344 | * until the given task is not in `pending` state. 345 | * 346 | * @param p - Params. 347 | * @param p.crawlerId - The id of the crawler the task has been triggered on. 348 | * @param p.taskId - The id of the task. 349 | * @returns A promise that will resolve when the task has been executed. 350 | */ 351 | async waitForTaskToComplete({ 352 | crawlerId, 353 | taskId, 354 | }: TaskParams): Promise { 355 | const res = await fetch( 356 | `${this.crawlerApiBaseUrl}/crawlers/${crawlerId}/tasks/${taskId}`, 357 | { 358 | headers: { 359 | Authorization: this.basicAuthToken, 360 | 'User-Agent': USER_AGENT, 361 | }, 362 | } 363 | ); 364 | const { pending } = (await res.json()) as any; 365 | if (pending) { 366 | // console.log(`Task ${taskId} is pending, waiting...`); 367 | await new Promise((resolve) => { 368 | setTimeout(resolve, 1000); 369 | }); 370 | await this.waitForTaskToComplete({ crawlerId, taskId }); 371 | } 372 | } 373 | 374 | /** 375 | * Test a crawler config against an URL. 376 | * 377 | * @param p - Params. 378 | * @param p.crawlerId - The id of the crawler's config to test against. 379 | * @param p.url - The URL to test. 380 | * @param p.config - (optional) A partial configuration, that will be merged with the existing configuration 381 | * before testing the URL (the resulting configuration is only used for the test and not saved in DB). 382 | * This permit you to test modifications on a configuration before saving them. 383 | * @returns A promise that will resolve with an object containing the results of the test. 
384 | */ 385 | async testUrl({ 386 | crawlerId, 387 | url, 388 | config, 389 | }: TestUrlParams): Promise { 390 | const res = await fetch( 391 | `${this.crawlerApiBaseUrl}/crawlers/${crawlerId}/test`, 392 | { 393 | method: 'POST', 394 | headers: { 395 | Authorization: this.basicAuthToken, 396 | 'Content-Type': 'application/json', 397 | 'User-Agent': USER_AGENT, 398 | }, 399 | body: JSON.stringify({ url, config }), 400 | } 401 | ); 402 | return (await res.json()) as UrlTestResponseBody; 403 | } 404 | } 405 | -------------------------------------------------------------------------------- /src/helpers.test.ts: -------------------------------------------------------------------------------- 1 | import { CrawlerApiClient } from './crawler-api-client'; 2 | import { getCrawlerId } from './helpers'; 3 | 4 | jest.mock('./crawler-api-client'); 5 | 6 | const CRAWLER_USER_ID = '00000000-0000-4000-a000-000000000001'; 7 | const CRAWLER_API_KEY = '00000000-0000-4000-a000-000000000001'; 8 | const CRAWLER_API_BASE_URL = 'https://crawler.algolia.com/api/1'; 9 | const ALGOLIA_APP_ID = process.env.ALGOLIA_APP_ID!; 10 | 11 | describe('crawlerReindex', () => { 12 | it('should create a crawler if none', async () => { 13 | const client = new CrawlerApiClient({ 14 | crawlerApiBaseUrl: CRAWLER_API_BASE_URL, 15 | crawlerUserId: CRAWLER_USER_ID, 16 | crawlerApiKey: CRAWLER_API_KEY, 17 | }); 18 | const spyGet = jest.spyOn(client, 'getCrawlers'); 19 | spyGet.mockImplementation(() => { 20 | return Promise.resolve({ 21 | items: [], 22 | itemsPerPage: 20, 23 | page: 1, 24 | total: 0, 25 | }); 26 | }); 27 | 28 | const spyCreate = jest.spyOn(client, 'createCrawler'); 29 | spyCreate.mockImplementation(() => { 30 | return Promise.resolve({ 31 | id: 'foobar', 32 | }); 33 | }); 34 | 35 | const id = await getCrawlerId( 36 | { 37 | client, 38 | name: 'test', 39 | override: false, 40 | }, 41 | { 42 | appId: ALGOLIA_APP_ID, 43 | apiKey: CRAWLER_API_KEY, 44 | indexName: 'test', 45 | siteUrl: 'http://localhost', 46 | } 47 | ); 48 | 49 | expect(spyGet).toHaveBeenCalledTimes(1); 50 | expect(id).toBe('foobar'); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /src/helpers.ts: -------------------------------------------------------------------------------- 1 | import * as core from '@actions/core'; 2 | import * as github from '@actions/github'; 3 | import type { GitHub } from '@actions/github/lib/utils'; 4 | 5 | import type { CrawlerApiClient } from './crawler-api-client'; 6 | import type { ConfigJson } from './types/configJson'; 7 | import type { GithubComment } from './types/github'; 8 | 9 | export function getConfig({ 10 | appId, 11 | apiKey, 12 | siteUrl, 13 | indexName, 14 | }: Pick & { 15 | siteUrl: string; 16 | indexName: string; 17 | }): ConfigJson { 18 | return { 19 | appId, 20 | apiKey, 21 | indexPrefix: 'crawler_', 22 | rateLimit: 8, 23 | startUrls: [siteUrl], 24 | ignoreQueryParams: ['source', 'utm_*'], 25 | ignoreNoIndex: false, 26 | ignoreNoFollowTo: false, 27 | ignoreRobotsTxtRules: false, 28 | actions: [ 29 | { 30 | indexName: `${indexName}_index`, 31 | pathsToMatch: [`${siteUrl}**`], 32 | recordExtractor: { 33 | __type: 'function', 34 | source: getRecordExtractorSource(), 35 | }, 36 | }, 37 | ], 38 | }; 39 | } 40 | 41 | function getRecordExtractorSource(): string { 42 | return `({ helpers }) => { 43 | return helpers.netlifyExtractor({ template: 'default' }); 44 | }`; 45 | } 46 | 47 | function findCommentPredicate( 48 | crawlerId: string, 49 | comment: GithubComment 
50 | ): boolean { 51 | return ( 52 | (comment.user ? comment.user.login === 'github-actions[bot]' : false) && 53 | (comment.body ? comment.body.includes(crawlerId) : false) 54 | ); 55 | } 56 | 57 | async function findComment({ 58 | octokit, 59 | prNumber, 60 | crawlerId, 61 | }: { 62 | octokit: InstanceType; 63 | prNumber: number; 64 | crawlerId: string; 65 | }): Promise { 66 | const parameters = { 67 | owner: github.context.repo.owner, 68 | repo: github.context.repo.repo, 69 | issue_number: prNumber, 70 | }; 71 | 72 | for await (const { data: comments } of octokit.paginate.iterator( 73 | octokit.rest.issues.listComments, 74 | parameters 75 | )) { 76 | // Search each page for the comment 77 | const gaComment = comments.find((comment) => 78 | findCommentPredicate(crawlerId, comment) 79 | ); 80 | if (gaComment) { 81 | return gaComment; 82 | } 83 | } 84 | 85 | return undefined; 86 | } 87 | 88 | export async function addComment({ 89 | octokit, 90 | crawlerApiBaseUrl, 91 | crawlerId, 92 | appId, 93 | name, 94 | }: { 95 | octokit: InstanceType; 96 | crawlerApiBaseUrl: string; 97 | crawlerId: string; 98 | appId: string; 99 | name: string; 100 | }): Promise { 101 | try { 102 | const context = github.context; 103 | if (context.payload.pull_request === undefined) { 104 | core.info('No pull request found.'); 105 | return; 106 | } 107 | 108 | const prNumber = context.payload.pull_request.number; 109 | 110 | // First check if the comment doesn't already exist 111 | const comment = await findComment({ octokit, prNumber, crawlerId }); 112 | 113 | const pathArray = crawlerApiBaseUrl.split('/'); 114 | const protocol = pathArray[0]; 115 | const host = pathArray[2]; 116 | const baseUrl = `${protocol}//${host}`; 117 | 118 | const message = `

Check your created Crawler
119 | Check your created index on your Algolia Application
`; 120 | 121 | // If the comment exists, we update it 122 | if (comment !== undefined) { 123 | core.info('Existing comment found.'); 124 | await octokit.rest.issues.updateComment({ 125 | ...context.repo, 126 | comment_id: comment.id, 127 | body: message, 128 | }); 129 | core.info(`Updated comment id '${comment.id}'.`); 130 | return; 131 | } 132 | 133 | octokit.rest.issues.createComment({ 134 | ...context.repo, 135 | issue_number: prNumber, 136 | body: message, 137 | }); 138 | } catch (error) { 139 | let errorMessage = 'An unexpected error happened.'; 140 | 141 | if (error instanceof Error) { 142 | errorMessage = error.message; 143 | } else { 144 | // eslint-disable-next-line no-console 145 | console.log(error); 146 | } 147 | 148 | core.setFailed(errorMessage); 149 | } 150 | } 151 | 152 | export async function getCrawlerId( 153 | { 154 | client, 155 | name, 156 | override, 157 | }: { 158 | client: CrawlerApiClient; 159 | name: string; 160 | override: boolean; 161 | }, 162 | config: Pick & { 163 | siteUrl: string; 164 | indexName: string; 165 | } 166 | ): Promise { 167 | // Searching for the crawler, based on the name and application ID 168 | const crawlers = await client.getCrawlers({ 169 | name, 170 | appId: config.appId, 171 | }); 172 | 173 | if (crawlers.items.length > 0) { 174 | // If the crawler exists : update it 175 | const crawlerId = crawlers.items[0].id; 176 | if (override) { 177 | const configJson = getConfig(config); 178 | await client.updateConfig(crawlerId, configJson); 179 | } 180 | return crawlerId; 181 | } 182 | 183 | // If it doesn't exist yet: create it 184 | const crawler = await client.createCrawler(name, getConfig(config)); 185 | return crawler.id; 186 | } 187 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-console */ 2 | import * as core from '@actions/core'; 3 | import * as github from '@actions/github'; 4 | 5 | import { CrawlerApiClient } from './crawler-api-client'; 6 | import { addComment, getCrawlerId } from './helpers'; 7 | 8 | // CREDENTIALS 9 | const CRAWLER_USER_ID = core.getInput('crawler-user-id'); 10 | const CRAWLER_API_KEY = core.getInput('crawler-api-key'); 11 | const CRAWLER_API_BASE_URL = core.getInput('crawler-api-base-url'); 12 | const GITHUB_TOKEN = core.getInput('github-token'); 13 | 14 | // CRAWLER CONFIGURATION 15 | const CRAWLER_NAME = core.getInput('crawler-name'); 16 | const INDEX_NAME = CRAWLER_NAME.replace(/[ /]/g, '-').replace( 17 | /[/~,[\]`&|;$*\\]/g, 18 | '' 19 | ); 20 | const ALGOLIA_APP_ID = core.getInput('algolia-app-id'); 21 | const ALGOLIA_API_KEY = core.getInput('algolia-api-key'); 22 | const SITE_URL = core.getInput('site-url'); 23 | const OVERRIDE_CONFIG = core.getInput('override-config') === 'true'; 24 | 25 | async function run(): Promise { 26 | const crawlerApiBaseUrl = CRAWLER_API_BASE_URL; 27 | const appId = ALGOLIA_APP_ID; 28 | const name = CRAWLER_NAME; 29 | const siteUrl = SITE_URL; 30 | const indexName = INDEX_NAME; 31 | 32 | const client = new CrawlerApiClient({ 33 | crawlerApiBaseUrl, 34 | crawlerUserId: CRAWLER_USER_ID, 35 | crawlerApiKey: CRAWLER_API_KEY, 36 | }); 37 | const octokit = github.getOctokit(GITHUB_TOKEN); 38 | 39 | console.log('---------CRAWLER CONFIG---------'); 40 | console.log('config', JSON.stringify({ name, appId, siteUrl, indexName })); 41 | 42 | let crawlerId: string; 43 | try { 44 | crawlerId = await getCrawlerId( 45 | { 46 | client, 47 | 
override: OVERRIDE_CONFIG, 48 | name, 49 | }, 50 | { 51 | appId, 52 | apiKey: ALGOLIA_API_KEY, 53 | indexName, 54 | siteUrl, 55 | } 56 | ); 57 | } catch (err) { 58 | core.error(new Error('Can not upsert crawler'), { 59 | title: err instanceof Error ? err.message : '', 60 | }); 61 | core.setFailed('Can not upsert crawler'); 62 | return; 63 | } 64 | 65 | console.log(`---------- Reindexing crawler ${crawlerId} ----------`); 66 | await client.reindex(crawlerId); 67 | 68 | await addComment({ octokit, crawlerApiBaseUrl, crawlerId, appId, name }); 69 | } 70 | 71 | run().catch((error) => { 72 | core.setFailed(error); 73 | }); 74 | -------------------------------------------------------------------------------- /src/types/algoliaSettings.ts: -------------------------------------------------------------------------------- 1 | // Copied from algoliasearchjs client 2 | // Explicitely copied for monaco editor, direct import did not work (but should) 3 | // If you find a solution you can remove this file 4 | 5 | export interface AlgoliaSettings { 6 | /** 7 | * The complete list of attributes that will be used for searching. 8 | */ 9 | searchableAttributes?: string[]; 10 | /** 11 | * @deprecated Use `searchableAttributes` instead. 12 | */ 13 | attributesToIndex?: string[]; 14 | /** 15 | * The complete list of attributes that will be used for faceting. 16 | */ 17 | attributesForFaceting?: string[]; 18 | /** 19 | * List of attributes that cannot be retrieved at query time. 20 | */ 21 | unretrievableAttributes?: string[]; 22 | /** 23 | * Gives control over which attributes to retrieve and which not to retrieve. 24 | */ 25 | attributesToRetrieve?: string[]; 26 | /** 27 | * Controls the way results are sorted. 28 | */ 29 | ranking?: string[]; 30 | /** 31 | * Specifies the custom ranking criterion. 32 | */ 33 | customRanking?: string[]; 34 | /** 35 | * Creates replicas, exact copies of an index. 36 | */ 37 | replicas?: string[]; 38 | /** 39 | * @deprecated Use `replicas` instead. 40 | */ 41 | slaves?: string[]; 42 | /** 43 | * The primary parameter is automatically added to a replica's settings when the replica is created and cannot be modified. 44 | * 45 | * Can not be setted. 46 | */ 47 | primary?: string; 48 | /** 49 | * Maximum number of facet values to return for each facet during a regular search. 50 | */ 51 | maxValuesPerFacet?: number; 52 | /** 53 | * Controls how facet values are sorted. 54 | */ 55 | sortFacetValuesBy?: 'alpha' | 'count'; 56 | /** 57 | * List of attributes to highlight. 58 | */ 59 | attributesToHighlight?: string[]; 60 | /** 61 | * List of attributes to snippet, with an optional maximum number of words to snippet. 62 | */ 63 | attributesToSnippet?: string[]; 64 | /** 65 | * The HTML string to insert before the highlighted parts in all highlight and snippet results. 66 | */ 67 | highlightPreTag?: string; 68 | /** 69 | * The HTML string to insert after the highlighted parts in all highlight and snippet results. 70 | */ 71 | highlightPostTag?: string; 72 | /** 73 | * String used as an ellipsis indicator when a snippet is truncated. 74 | */ 75 | snippetEllipsisText?: string; 76 | /** 77 | * Restrict highlighting and snippeting to items that matched the query. 78 | */ 79 | restrictHighlightAndSnippetArrays?: boolean; 80 | /** 81 | * Set the number of hits per page. 82 | */ 83 | hitsPerPage?: number; 84 | /** 85 | * Set the maximum number of hits accessible via pagination. 
86 | */ 87 | paginationLimitedTo?: number; 88 | /** 89 | * Minimum number of characters a word in the query string must contain to accept matches with 1 typo. 90 | */ 91 | minWordSizefor1Typo?: number; 92 | /** 93 | * Minimum number of characters a word in the query string must contain to accept matches with 2 typos. 94 | */ 95 | minWordSizefor2Typos?: number; 96 | /** 97 | * Controls whether typo tolerance is enabled and how it is applied. 98 | */ 99 | typoTolerance?: boolean | string; 100 | /** 101 | * Hether to allow typos on numbers (“numeric tokens”) in the query string. 102 | */ 103 | allowTyposOnNumericTokens?: boolean; 104 | /** 105 | * List of attributes on which you want to disable typo tolerance. 106 | */ 107 | disableTypoToleranceOnAttributes?: string[]; 108 | /** 109 | * List of words on which you want to disable typo tolerance. 110 | */ 111 | disableTypoToleranceOnWords?: string[]; 112 | /** 113 | * Control which separators are indexed. 114 | */ 115 | separatorsToIndex?: string; 116 | /** 117 | * Treats singular, plurals, and other forms of declensions as matching terms. 118 | */ 119 | ignorePlurals?: string[] | boolean; 120 | /** 121 | * Sets the languages to be used by language-specific settings and functionalities such as ignorePlurals, removeStopWords, and CJK word-detection. 122 | */ 123 | queryLanguages?: string[]; 124 | /** 125 | * A list of language ISO code. 126 | */ 127 | indexLanguages?: string[]; 128 | /** 129 | * Whether rules should be globally enabled. 130 | */ 131 | enableRules?: boolean; 132 | /** 133 | * Controls if and how query words are interpreted as prefixes. 134 | */ 135 | queryType?: 'prefixAll' | 'prefixLast' | 'prefixNone'; 136 | /** 137 | * Selects a strategy to remove words from the query when it doesn’t match any hits. 138 | */ 139 | removeWordsIfNoResults?: 'allOptional' | 'firstWords' | 'lastWords' | 'none'; 140 | /** 141 | * Enables the advanced query syntax. 142 | */ 143 | advancedSyntax?: boolean; 144 | /** 145 | * AdvancedSyntaxFeatures can be exactPhrase or excludeWords. 146 | */ 147 | advancedSyntaxFeatures?: Array<'exactPhrase' | 'excludeWords'>; 148 | /** 149 | * A list of words that should be considered as optional when found in the query. 150 | */ 151 | optionalWords?: string[]; 152 | /** 153 | * List of attributes on which you want to disable prefix matching. 154 | */ 155 | disablePrefixOnAttributes?: string[]; 156 | /** 157 | * List of attributes on which you want to disable the exact ranking criterion. 158 | */ 159 | disableExactOnAttributes?: string[]; 160 | /** 161 | * Controls how the exact ranking criterion is computed when the query contains only one word. 162 | */ 163 | exactOnSingleWordQuery?: 'attribute' | 'none' | 'word'; 164 | /** 165 | * List of alternatives that should be considered an exact match by the exact ranking criterion. 166 | */ 167 | alternativesAsExact?: Array< 168 | 'ignorePlurals' | 'multiWordsSynonym' | 'singleWordSynonym' 169 | >; 170 | /** 171 | * Removes stop (common) words from the query before executing it. 172 | */ 173 | removeStopWords?: string[] | boolean; 174 | /** 175 | * List of numeric attributes that can be used as numerical filters. 176 | */ 177 | numericAttributesForFiltering?: string[]; 178 | /** 179 | * Enables compression of large integer arrays. 180 | */ 181 | allowCompressionOfIntegerArray?: boolean; 182 | /** 183 | * Name of the de-duplication attribute to be used with the distinct feature. 
184 | */ 185 | attributeForDistinct?: string; 186 | /** 187 | * Enables de-duplication or grouping of results. 188 | */ 189 | distinct?: boolean | number; 190 | /** 191 | * Whether to highlight and snippet the original word that matches the synonym or the synonym itself. 192 | */ 193 | replaceSynonymsInHighlight?: boolean; 194 | /** 195 | * Allows proximity to impact which searchable attribute is matched in the attribute ranking stage. 196 | */ 197 | attributeCriteriaComputedByMinProximity?: boolean; 198 | /** 199 | * Precision of the proximity ranking criterion. 200 | */ 201 | minProximity?: number; 202 | /** 203 | * Choose which fields the response will contain. Applies to search and browse queries. 204 | */ 205 | responseFields?: string[]; 206 | /** 207 | * Maximum number of facet hits to return during a search for facet values. 208 | */ 209 | maxFacetHits?: number; 210 | /** 211 | * List of attributes on which to do a decomposition of camel case words. 212 | */ 213 | camelCaseAttributes?: string[]; 214 | /** 215 | * Specify on which attributes in your index Algolia should apply word-splitting (“decompounding”). 216 | */ 217 | decompoundedAttributes?: Record; 218 | /** 219 | * Characters that should not be automatically normalized by the search engine. 220 | */ 221 | keepDiacriticsOnCharacters?: string; 222 | /** 223 | * Overrides Algolia's default normalization. 224 | */ 225 | customNormalization?: Record>; 226 | /** 227 | * Custom userData that could be added to the Settings. 228 | */ 229 | userData?: any; 230 | } 231 | -------------------------------------------------------------------------------- /src/types/config.ts: -------------------------------------------------------------------------------- 1 | /// 2 | 3 | import type { AlgoliaSettings } from './algoliaSettings'; 4 | import type { FileTypes } from './fileTypes'; 5 | 6 | /** 7 | * Specification of a CrawlerConfig object, i.e. The unserialized UserConfig.config field. 8 | */ 9 | export interface ExternalDataSourceGoogleAnalytics { 10 | dataSourceId: string; 11 | type: 'googleanalytics'; 12 | metrics: string[]; 13 | startDate?: string; 14 | endDate?: string; 15 | samplingLevel?: 'DEFAULT' | 'LARGE' | 'SMALL'; 16 | credentials: { 17 | type: 'service_account'; 18 | client_email: string; 19 | private_key: string; 20 | viewIds?: string[]; 21 | }; 22 | } 23 | 24 | export interface ExternalDataSourceCSV { 25 | dataSourceId: string; 26 | type: 'csv'; 27 | url: string; 28 | } 29 | 30 | export interface ExtractionHelpers { 31 | splitContentIntoRecords: (params?: { 32 | /** 33 | * A [Cheerio instance](https://cheerio.js.org/) that determines from which element(s) textual content will be extracted and turned into records. 34 | * 35 | * @default `$('body')` 36 | */ 37 | $elements?: cheerio.Cheerio; 38 | 39 | /** 40 | * Attributes (and their values) to add to all resulting records. 41 | * 42 | * @default `{}` 43 | */ 44 | baseRecord?: Record; 45 | 46 | /** 47 | * Maximum number of bytes allowed per record, on the resulting Algolia index. 48 | * 49 | * @default `10000` 50 | */ 51 | maxRecordBytes?: number; 52 | 53 | /** 54 | * Name of the attribute in which to store the text of each record. 55 | * 56 | * @default `'text'` 57 | */ 58 | textAttributeName?: string; 59 | 60 | /** 61 | * Name of the attribute in which to store the number of each record. 
62 | */ 63 | orderingAttributeName?: string; 64 | }) => Array>; 65 | 66 | docsearch: (params: { 67 | selectors: { 68 | lvl0?: string; 69 | lvl1: string; 70 | lvl2?: string; 71 | lvl3?: string; 72 | lvl4?: string; 73 | lvl5?: string; 74 | lvl6?: string; 75 | content: string; 76 | }; 77 | 78 | /** 79 | * Should we indexHeadings 80 | * - true = yes 81 | * - false = no 82 | * - { from, to } = from lvl to lvl only. 83 | */ 84 | indexHeadings?: 85 | | false 86 | | { 87 | from: number; 88 | to: number; 89 | }; 90 | }) => Array<{ 91 | objectID: string; 92 | [key: string]: any; 93 | }>; 94 | } 95 | 96 | export type RecordExtractor = (params: { 97 | /** A [Cheerio instance](https://cheerio.js.org/) that contains the HTML for the crawled page. */ 98 | $: cheerio.Root; 99 | 100 | /** A [Location object](https://developer.mozilla.org/en-US/docs/Web/API/Location) containing the URL and metadata for the crawled page. */ 101 | url: URL; 102 | 103 | /** The fileType of the crawled page (e.g.: html, pdf, ...). */ 104 | fileType: keyof typeof FileTypes; 105 | 106 | /** The number of bytes in the crawled page. */ 107 | contentLength: number; 108 | 109 | /** Array of external data sources. */ 110 | dataSources: { [dataSourceName: string]: { [key: string]: any } }; 111 | 112 | /** A set of functions to help you extract content. */ 113 | helpers: ExtractionHelpers; 114 | }) => Array<{ 115 | objectID?: string; 116 | [key: string]: any; 117 | }>; 118 | 119 | export interface ExtractorCustom { 120 | type: 'custom'; 121 | params: { 122 | method: RecordExtractor; 123 | }; 124 | } 125 | 126 | export interface Action { 127 | /** Unique name of the action. */ 128 | name?: string; 129 | 130 | indexName: string; 131 | 132 | partialUpdate?: boolean; 133 | 134 | /** How often this specific action will run. 135 | * See root level schedule for more details. 136 | */ 137 | schedule?: string; 138 | 139 | /** Will determine which webpages will match for this action. This list is checked against the url of webpages using [micromatch](https://github.com/micromatch/micromatch). Negation, wildcards and more can be used. Check the full documentation. */ 140 | pathsToMatch?: string[]; 141 | 142 | /** Will check for the presence or absence of DOM nodes. */ 143 | selectorsToMatch?: string[]; 144 | 145 | /** Override if you want to index documents. Chosen file types will be converted to HTML using [Tika](https://wiki.apache.org/tika/TikaJAXRS), then treated as a normal HTML page. See the [documents guide](https://www.algolia.com/doc/tools/crawler/guides/extracting-data/how-to/index-documents/) for a list of available `fileTypes`. */ 146 | fileTypesToMatch?: Array; 147 | 148 | /** Generate an `objectID` for records that don't have one. See the [`objectID` definition](#). Setting this parameter to `false` means we'll raise an error in case an extracted record doesn't have an `objectID`. Note, this parameter is not compatible with `partialUpdate = true`. */ 149 | autoGenerateObjectIDs?: boolean; 150 | 151 | /** An recordExtractor is just a custom Javascript function that let you execute your own code and extract what you want from a page. */ 152 | recordExtractor?: RecordExtractor; 153 | extractors?: ExtractorCustom[]; 154 | } 155 | 156 | /** 157 | * Typed Schema used for autocompletion in the Editor of the Admin Console. 158 | * Note: please keep in sync with crawler-common/src/config/validation. 
159 | */ 160 | export interface Config { 161 | /** @required Application ID that specifies which of your Algolia application you want to save your crawler extractions to. */ 162 | appId: string; 163 | 164 | /** 165 | * @required Algolia API key for your targeted Algolia application. Using the Admin API key is not allowed, and it must: 166 | * - Have the following rights: `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` 167 | * - Have access to the correct set of indexes, according to the `indexPrefix` (e.g. have access to `crawler_*` if the indexPrefix is `crawler_`) 168 | * 169 | * This key will be generated for you by the Admin Console when you create a configuration, if you provide the Admin API Key. We will never store the Admin API Key. 170 | */ 171 | apiKey: string; 172 | 173 | /** 174 | * @default 8 seconds 175 | * 176 | * @required Number of concurrent tasks (per second) that can run for this configuration. Higher means more crawls per second. 177 | * This number works with the following formula: 178 | * ``` 179 | * MAX ( urls_added_in_the_last_second, urls_currently_being_processed ) <= rateLimit 180 | * ``` 181 | * If fetching, processing, uploading is taking less than a second, your crawler processes `rateLimit` urls per second. 182 | * 183 | * However, if each page takes on average 4 secondes to be processed, your crawler processes `rateLimit / 4` pages per second. 184 | * 185 | * It's recommend to start with a low value (e.g. 2) and update it if you need faster crawling: a high `rateLimit` can have a huge impact over bandwidth cost and server resource consumption. 186 | */ 187 | rateLimit: number; 188 | 189 | /** 190 | * How often you want to execute a complete recrawl. Expressed using [Later.js' syntax](https://bunkat.github.io/later/). 191 | * 192 | * If omitted, you will need to manually launch a reindex operation in order to update the crawled records. 193 | * 194 | * Important notes: 195 | * 1. The interval between two scheduled crawls must be equal or higher than 24 hours. 196 | * 2. Times will be interpreted as UTC (GMT+0 timezone). 197 | */ 198 | schedule?: string; 199 | 200 | /** 201 | * When `true`, all web pages are rendered with a chrome headless browser. You get the rendered HTML result. 202 | * 203 | * Because rendering JavaScript-based web pages is much slower than crawling regular HTML pages, you can apply this setting to a specified list of [micromatch](https://github.com/micromatch/micromatch) URL patterns. These patterns can include negations and wildcards. 204 | * 205 | * With this setting enabled, JavaScript is executed on the webpage. Because a lot of websites have infinite refreshes and updates, this Chrome headless browser is configured with a timeout (set to a few seconds). 206 | * 207 | * This can lead to inconsistent records across recrawls, depending on the browser load and the website speed. 208 | * 209 | * Make sure your crawler manages to load the data from JavaScript-based pages interested in fast enough. 210 | */ 211 | renderJavaScript?: string[] | boolean; 212 | 213 | /** Saves a backup of your production index before it is overwritten by the index generated during a recrawl. */ 214 | saveBackup?: boolean; 215 | 216 | /** 217 | * When set to `true`, this tells the Crawler to ignore rules set in the robots.txt. 218 | */ 219 | ignoreRobotsTxtRules?: boolean; 220 | 221 | /** 222 | * Whether the Crawler should extract records from a page whose `robots` meta tag contains `noindex` or `none`. 
223 | * 224 | * When `true`, the crawler will ignore the `noindex` directive of the [robots meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names). 225 | * 226 | * Its default value is currently `true`, but it will change to `false` in a near future. If you'd like the crawler to not respect the `noindex` directive, you should set it explicitely. 227 | */ 228 | ignoreNoIndex?: boolean; 229 | 230 | /** 231 | * Whether the Crawler should follow links marked as `nofollow`. 232 | * 233 | * This setting applies to both: 234 | * - links which should be ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow`; 235 | * - links whose [rel attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) contains the `nofollow` directive. 236 | * 237 | * When `true`, the crawler will consider those links as if they weren't marked to be ignored. 238 | * 239 | * The crawler might still ignore links that don't match the patterns of your configuration. 240 | * 241 | * Its default value is currently `true`, but it will change to `false` in a near future. If you'd like the crawler to never respect `nofollow` directives, you should set it explicitely. 242 | * 243 | * Note: The "To" suffix is here for consistency with `ignoreCanonicalTo`. While it only accepts a boolean for now, we plan for it to accept an array of patterns eventually. Please contact us if you need such fine grained control. 244 | */ 245 | ignoreNoFollowTo?: boolean; 246 | 247 | /** 248 | * This tells the Crawler to process a page even if there is a meta canonical URL specified. 249 | * 250 | * When set to `true`, it will ignore all canonical. 251 | * When set to `string[]`, it will ignore canonical that matches the specified patterns. 252 | */ 253 | ignoreCanonicalTo?: string[] | boolean; 254 | 255 | /** 256 | * @required if no `sitemaps` 257 | * 258 | * Your crawler uses these URLs as a starting point for its crawl. 259 | */ 260 | startUrls?: string[]; 261 | 262 | /** 263 | * @required if no `startUrls` 264 | * 265 | * URLs found in `sitemaps` are treated as `startUrls` for your crawler: they are used as start points for the crawl. 266 | */ 267 | sitemaps?: string[]; 268 | 269 | /** 270 | * URLs found in `extraUrls` are treated as `startUrls` for your crawler: they are used as start points for the crawl. 271 | * 272 | * Crawler saves URLs added through the **Add a URL** field of the Admin's Configuration tab to the `extraUrls` array. 273 | * 274 | * Internally `extraUrls` is treated like `startUrls`. The seperate parameter serves to identify which URLs were added directly to the crawler's configuration file vs. Those that were added through the Admin. 275 | */ 276 | extraUrls?: string[]; 277 | 278 | /** 279 | * Determines the webpage patterns ignored or excluded during a crawl. 280 | * 281 | * This list is checked against the url of webpages using [micromatch](https://github.com/micromatch/micromatch). You can use negation, wildcards, and more. 282 | */ 283 | exclusionPatterns?: string[]; 284 | 285 | /** Filters out specified query parameters from crawled urls. Useful for avoiding duplicate crawls of the same page. */ 286 | ignoreQueryParams?: string[]; 287 | 288 | /** Prefix added in front of all indices defined in the crawler's configuration. */ 289 | indexPrefix?: string; 290 | 291 | /** 292 | * Defines the settings for the indices that updated by your crawler. 
293 |    *
294 |    * Index names should be provided as keys. Their values are objects that define Algolia index settings as properties (e.g. `searchableAttributes`, `attributesForFaceting`).
295 |    *
296 |    * Index settings will only be applied to your Algolia index during the first run (or if the index doesn't exist when launching the reindex). Once an index has been created, settings are never re-applied: this prevents overriding any manual changes you may have made.
297 |    */
298 |   initialIndexSettings?: {
299 |     [indexName: string]: AlgoliaSettings;
300 |   };
301 | 
302 |   /**
303 |    * Limits the number of URLs your crawler processes.
304 |    *
305 |    * Useful for demoing and preventing infinite link holes in the website structure.
306 |    *
307 |    * `maxUrls` does not guarantee consistent indexing across recrawls. Because of parallel processing, discovered URLs can be processed in different orders for different recrawls.
308 |    *
309 |    * This parameter is capped at a maximum of `1,000,000`.
310 |    */
311 |   maxUrls?: number;
312 | 
313 |   /**
314 |    * Limits the processing of URLs to a specified depth, inclusively.
315 |    *
316 |    * _Maximum_: `100`.
317 |    *
318 |    * URLs added manually (startUrls, sitemaps...) are not checked against this limit.
319 |    *
320 |    * **How we calculate depth:**
321 |    *
322 |    * @example
323 |    * ```javascript
324 |    * http://example.com => 1
325 |    * http://example.com/ => 1
326 |    * http://example.com/foo => 1
327 |    * http://example.com/foo/ => 2
328 |    * http://example.com/foo/bar => 2
329 |    * http://example.com/foo/bar/ => 3
330 |    * ...
331 |    * ```
332 |    */
333 |   maxDepth?: number;
334 | 
335 |   /**
336 |    * Defines which webpages will be visited.
337 |    * It is used in combination with the `pathsToMatch` of your actions.
338 |    * The Crawler will visit all links that match at least one of those paths.
339 |    */
340 |   discoveryPatterns?: string[];
341 | 
342 |   /**
343 |    * Defines hostnames (keys) that will be transformed into the values specified.
344 |    * The keys are exact match only.
345 |    *
346 |    * Applied to:
347 |    * - All URLs found
348 |    * - Canonical URLs
349 |    * - Redirections.
350 |    *
351 |    * Not applied to:
352 |    * - props: startUrls, extraUrls, pathsToMatch, etc...
353 |    * - URLs in your code.
354 |    *
355 |    * @example
356 |    * ```javascript
357 |    * hostnameAliases: {
358 |    *   'algolia.com': 'dev.algolia.com'
359 |    * }
360 |    * ```
361 |    */
362 |   hostnameAliases?: Record<string, string>;
363 | 
364 |   pathAliases?: Record<string, Record<string, string>>;
365 | 
366 |   /**
367 |    * Determines the function used to extract URLs from pages.
368 |    *
369 |    * If provided, this function is called on a crawled page. Only the URLs it returns are enqueued for further crawling. By default, all the URLs found while crawling a page are enqueued given that they comply with `pathsToMatch`, `fileTypesToMatch` and `exclusions`.
370 |    *
371 |    * Expected return value: `array` of `strings` (URLs).
372 |    */
373 |   linkExtractor?: (params: {
374 |     $: cheerio.Root;
375 |     url: URL;
376 |     defaultExtractor: () => string[];
377 |   }) => string[];
378 | 
379 |   /**
380 |    * Modifies the behavior of all requests.
381 |    *
382 |    * The Cookie header will be overridden by the cookie fetched in `login`.
383 |    */
384 |   requestOptions?: {
385 |     proxy?: string;
386 |     timeout?: number;
387 |     retries?: number;
388 |     headers?: {
389 |       'Accept-Language'?: string;
390 |       Authorization?: string;
391 |       Cookie?: string;
392 |     };
393 |   };
394 | 
395 |   /**
396 |    * This property can be set in order to define how the Crawler should log in to the website before crawling pages.
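   *
   * A minimal sketch of the `fetchRequest` variant (the URL, header, and body values below are placeholders, not taken from the original file):
   *
   * ```javascript
   * login: {
   *   fetchRequest: {
   *     url: 'https://example.com/login',
   *     requestOptions: {
   *       method: 'POST',
   *       headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
   *       body: 'id=my-id&password=my-password',
   *     },
   *   },
   * }
   * ```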
397 |    *
398 |    * The Crawler will then extract the `Set-Cookie` response header from the login page and send that Cookie when crawling all pages of the website defined in the configuration.
399 |    */
400 |   login?: {
401 |     fetchRequest?: {
402 |       url: string;
403 |       requestOptions?: {
404 |         method?: string;
405 |         headers?: {
406 |           'Content-Type'?: string;
407 |           Cookie?: string;
408 |           Authorization?: string;
409 |         };
410 |         body?: string;
411 |         timeout?: number;
412 |       };
413 |     };
414 |     browserRequest?: {
415 |       url: string;
416 |       username: string;
417 |       password: string;
418 |     };
419 |   };
420 | 
421 |   cache?: {
422 |     enabled: boolean;
423 |   };
424 | 
425 |   /**
426 |    * Defines external data sources you want to retrieve during every recrawl and make available to your extractors.
427 |    *
428 |    * **There are two supported data sources: Google Analytics and CSV files.**
429 |    *
430 |    * Once you set up an `externalDataSource`, it is exposed to your `extractors`.
431 |    * You can have a maximum of 10 sources and 11 million URLs across all sources.
432 |    * You can access them through the `dataSources` object, which has the following structure.
433 |    *
434 |    * @example
435 |    * ```javascript
436 |    * {
437 |    *   dataSourceId1: { data1: 'val1', data2: 'val2' },
438 |    *   dataSourceId2: { data1: 'val1', data2: 'val2' },
439 |    * }
440 |    * ```
441 |    */
442 |   externalDataSources?: Array<
443 |     ExternalDataSourceCSV | ExternalDataSourceGoogleAnalytics
444 |   >;
445 | 
446 |   /**
447 |    * Determines which web pages are translated into Algolia records and in what way.
448 |    *
449 |    * A single action defines:
450 |    * 1. The subset of your crawler's websites it targets,
451 |    * 2. The extraction process for those websites,
452 |    * 3. And the index(es) to which the extracted records are pushed.
453 |    *
454 |    * A single web page can match multiple actions. In this case, your crawler creates a record for each matched action.
455 |    */
456 |   actions: Action[];
457 | 
458 |   /**
459 |    * A configurable collection of safety checks to make sure the crawl was successful.
460 |    *
461 |    * This configuration describes all the checks the Crawler can perform to ensure data is correct.
462 |    * For example, comparing the number of records from one crawl to the next.
463 |    */
464 |   safetyChecks?: {
465 |     /**
466 |      * Checks triggered after the Crawler is done, and before the records
467 |      * are pushed to Algolia into the final index.
468 |      */
469 |     beforeIndexPublishing?: {
470 |       /**
471 |        * Defines the maximum allowed difference in the number of records between the new crawl and the last one, as a percentage of total records (inclusive).
472 |        *
473 |        * _Default_: `10`.
474 |        *
475 |        * _Minimum_: `0`\
476 |        * _Maximum_: `100`.
477 |        *
478 |        * If the new number of records is less than `last number of records * (1 - maxLostRecordsPercentage / 100)`,
479 |        * the process throws a `SafeReindexingError`, blocking the Crawler until manual restart.
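       *
       * For illustration (hypothetical numbers, not from the original file): with `maxLostRecordsPercentage: 10` and
       * 1,000 records in the last crawl, the new crawl must produce at least `1000 * (1 - 10 / 100) = 900` records to be published.
       *
       * @example
       * ```javascript
       * safetyChecks: {
       *   beforeIndexPublishing: { maxLostRecordsPercentage: 10 },
       * },
       * ```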
480 |        */
481 |       maxLostRecordsPercentage?: number;
482 |     };
483 |   };
484 | }
485 | 
486 | export default Config;
487 | 
--------------------------------------------------------------------------------
/src/types/configJson.ts:
--------------------------------------------------------------------------------
1 | import type { Config, Action, ExtractorCustom } from './config';
2 | import type { Modify } from './utils';
3 | 
4 | export type FunctionAsString = {
5 |   __type: 'function';
6 |   source: string;
7 | };
8 | 
9 | export type ExtractorCustomAsString = Modify<
10 |   ExtractorCustom,
11 |   {
12 |     params: {
13 |       method: FunctionAsString;
14 |     };
15 |   }
16 | >;
17 | 
18 | export type ActionAsString = Modify<
19 |   Action,
20 |   {
21 |     recordExtractor?: FunctionAsString;
22 |     extractors?: ExtractorCustomAsString[];
23 |   }
24 | >;
25 | 
26 | export type ConfigJson = Modify<
27 |   Config,
28 |   {
29 |     linkExtractor?: FunctionAsString;
30 |     actions: ActionAsString[];
31 |   }
32 | >;
33 | 
--------------------------------------------------------------------------------
/src/types/fileTypes.ts:
--------------------------------------------------------------------------------
1 | export enum FileTypes {
2 |   'html' = 'html',
3 |   'xml' = 'xml',
4 |   'pdf' = 'pdf',
5 |   'doc' = 'doc',
6 |   'xls' = 'xls',
7 |   'ppt' = 'ppt',
8 |   'odt' = 'odt',
9 |   'ods' = 'ods',
10 |   'odp' = 'odp',
11 | }
12 | 
13 | export type FileType = keyof typeof FileTypes;
14 | 
--------------------------------------------------------------------------------
/src/types/github.ts:
--------------------------------------------------------------------------------
1 | export interface GithubComment {
2 |   id: number;
3 |   body?: string;
4 |   user: {
5 |     login: string;
6 |   } | null;
7 | }
8 | 
--------------------------------------------------------------------------------
/src/types/publicApiJsonResponses.ts:
--------------------------------------------------------------------------------
1 | import type { ConfigJson } from './configJson';
2 | import type { Optional } from './utils';
3 | 
4 | enum JobStatusEnum {
5 |   DONE = 'DONE',
6 |   SKIPPED = 'SKIPPED',
7 |   FAILED = 'FAILED',
8 |   PENDING = 'PENDING',
9 | }
10 | 
11 | export interface UserConfigReindexSummaryGroup {
12 |   reason: string;
13 |   status: keyof typeof JobStatusEnum;
14 |   category?: string;
15 |   readable?: string;
16 |   nbUrls: number;
17 |   previousNbUrls?: number;
18 | }
19 | 
20 | export interface GetCrawlersResponseBody {
21 |   items: Array<{ id: string; name: string }>;
22 |   itemsPerPage: number;
23 |   page: number;
24 |   total: number;
25 | }
26 | 
27 | export interface CreatedCrawlerResponseBody {
28 |   id: string;
29 | }
30 | 
31 | export interface UpdateConfigResponseBody {
32 |   rateLimit: number;
33 |   startUrls: string[];
34 | }
35 | 
36 | export interface CrawlerStatusResponseBody {
37 |   name: string;
38 |   createdAt: string;
39 |   updatedAt: string;
40 |   running: boolean;
41 |   reindexing: boolean;
42 |   blocked: boolean;
43 |   blockingError?: string;
44 |   blockingTaskId?: string;
45 |   lastReindexStartedAt: string | null;
46 |   lastReindexEndedAt: string | null;
47 |   config?: ConfigJson;
48 | }
49 | 
50 | export interface GetUrlStatsResponseBody {
51 |   count: number;
52 |   data: UserConfigReindexSummaryGroup[];
53 | }
54 | 
55 | export interface TaskResponseBody {
56 |   taskId: string;
57 | }
58 | 
59 | export interface AlgoliaRecord {
60 |   objectID: string;
61 |   [key: string]: any;
62 | }
63 | 
64 | export interface RecordsPerExtractor {
65 |   index: number;
66 |   type: 'algoliaCache' | 'custom';
67 |   records: Array<Record<string, any>>;
68 | }
69 | 
70 | export interface ExtractedRecord {
71 |   actionName: string;
72 |   indexName: string;
73 |   partialUpdate: boolean;
74 |   records: AlgoliaRecord[];
75 |   recordsPerExtractor: RecordsPerExtractor[];
76 | }
77 | 
78 | export type UrlTesterRecord = Pick<
79 |   ExtractedRecord,
80 |   'indexName' | 'records' | 'recordsPerExtractor'
81 | >;
82 | 
83 | export interface ExternalDataOneUrl {
84 |   url: string;
85 |   dataSources: { [key: string]: any };
86 | }
87 | 
88 | export interface LoginResponse {
89 |   statusCode: number;
90 |   cookie: string | null;
91 |   httpHeaders: Headers;
92 |   error?: string;
93 | }
94 | 
95 | export interface UrlTestResponseBody {
96 |   startDate: string;
97 |   endDate: string;
98 |   logs: string[][];
99 |   records: UrlTesterRecord[];
100 |   links: string[];
101 |   externalData?: ExternalDataOneUrl['dataSources'];
102 |   error?: { code?: string; message: string; details?: any };
103 |   loginResponse?: LoginResponse;
104 | }
105 | 
--------------------------------------------------------------------------------
/src/types/utils.ts:
--------------------------------------------------------------------------------
1 | /* eslint-disable @typescript-eslint/naming-convention */
2 | 
3 | /**
4 |  * Take an interface and list the keys that are optional.
5 |  *
6 |  * @example
7 |  * interface Hello {
8 |  *   foo?: string;
9 |  *   bar?: string;
10 |  *   baz: string;
11 |  * }
12 |  *
13 |  * OptionalKeys<Hello>;
14 |  *
15 |  * Will result in:
16 |  * 'foo' | 'bar'
17 |  */
18 | export type OptionalKeys<T> = {
19 |   [K in keyof T]: undefined extends T[K] ? K : never;
20 | }[keyof T];
21 | 
22 | /**
23 |  * Take an interface and choose which properties should be optional (allowed to be undefined).
24 |  *
25 |  * @example
26 |  * interface Hello {
27 |  *   foo: string;
28 |  *   bar: string;
29 |  *   baz?: string;
30 |  * };
31 |  *
32 |  * Optional<Hello, 'bar'>;
33 |  *
34 |  * Will result in:
35 |  * {
36 |  *   foo: string;
37 |  *   bar?: string;
38 |  *   baz?: string;
39 |  * }
40 |  *
41 |  */
42 | export type Optional<T, K extends keyof T> = {
43 |   [P in Exclude<keyof T, Exclude<keyof T, K | OptionalKeys<T>>>]?: T[P];
44 | } & {
45 |   [P in Exclude<keyof T, K | OptionalKeys<T>>]: T[P];
46 | };
47 | 
48 | /**
49 |  * Take an interface and replace the specified properties. (By default TypeScript merges but does not replace.)
50 |  *
51 |  * @example
52 |  * interface Hello {
53 |  *   foo: string;
54 |  *   bar: string;
55 |  * };
56 |  *
57 |  * Modify<Hello, { bar: number }>;
58 |  *
59 |  * Will result in:
60 |  * {
61 |  *   foo: string;
62 |  *   bar: number;
63 |  * }
64 |  */
65 | export type Modify<T, R> = Omit<T, keyof R> & R;
66 | 
67 | /**
68 |  * Take an interface and choose which properties should not be undefined.
69 |  *
70 |  * @example
71 |  * interface Hello {
72 |  *   foo?: string;
73 |  *   bar?: string;
74 |  * };
75 |  *
76 |  * RequireSome<Hello, 'bar'>;
77 |  *
78 |  * Will result in:
79 |  * {
80 |  *   foo?: string;
81 |  *   bar: string;
82 |  * }
83 |  */
84 | export type RequireSome<T, K extends keyof T> = Omit<T, K> & {
85 |   [P in Exclude<keyof T, Exclude<keyof T, K>>]: Exclude<T[P], undefined>;
86 | };
87 | 
88 | // stackoverflow.com/questions/49285864/is-there-a-valueof-similar-to-keyof-in-typescript
89 | /**
90 |  * Get the values of an interface type.
91 |  *
92 |  * @example
93 |  * interface Foo { foo: string }
94 |  * ValueOf<Foo>
95 |  * => string.
96 |  */
97 | export type ValueOf<T> = T[keyof T];
98 | 
99 | /**
100 |  * Get the values of an array type.
101 |  *
102 |  * @example
103 |  * const arr = [ 'foobar' ];
104 |  * type ArrType = ValuesOfArray<typeof arr>;
105 |  */
106 | export type ValuesOfArray<T extends readonly any[]> = T[number];
107 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "module": "ES2020",
4 |     "esModuleInterop": true,
5 |     "target": "ES2020",
6 |     "baseUrl": "./",
7 |     "rootDir": "./src",
8 |     "outDir": "./dist",
9 |     "moduleResolution": "node",
10 |     "downlevelIteration": true,
11 |     "incremental": false,
12 |     "sourceMap": false,
13 |     "removeComments": true,
14 |     "allowJs": true,
15 |     "noImplicitUseStrict": true,
16 |     "resolveJsonModule": true,
17 |     "declaration": true,
18 |     "declarationMap": true,
19 |     "composite": false,
20 |     "preserveConstEnums": true,
21 |     "noEmitOnError": false,
22 |     "strict": true,
23 |     "noImplicitAny": true,
24 |     "strictNullChecks": true,
25 |     "noImplicitThis": true,
26 |     "strictFunctionTypes": true,
27 |     "strictPropertyInitialization": true,
28 |     "alwaysStrict": false,
29 |     "noUnusedLocals": true,
30 |     "noUnusedParameters": false,
31 |     "noImplicitReturns": true,
32 |     "noFallthroughCasesInSwitch": true,
33 |     "experimentalDecorators": true,
34 |     "emitDecoratorMetadata": true,
35 |     "skipLibCheck": true,
36 |     "types": [
37 |       "node",
38 |       "jest"
39 |     ],
40 |   },
41 |   "exclude": [
42 |     "node_modules",
43 |     "**/*.test.ts"
44 |   ],
45 |   "include": [
46 |     "src/**/*",
47 |     "package.json"
48 |   ],
49 |   "typeRoots": [
50 |     "node_modules/@types"
51 |   ]
52 | }
53 | 
--------------------------------------------------------------------------------