├── .eslintignore ├── .eslintrc.cjs ├── .github ├── scripts │ └── scrapix_server_call_check.sh └── workflows │ ├── publish-docker-image.yml │ └── tests.yml ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── config ├── nodemon:build.json ├── nodemon:default-scrap.json └── nodemon:docsearch-scrap.json ├── docker-compose.dev.yml ├── misc ├── Scrapix.postman_collection.json └── config_examples │ ├── default-bigger_batches.json │ ├── default-exclude_urls.json │ ├── default-simple.json │ ├── docsearch-simple.json │ ├── docusaurus-default.json │ ├── docusaurus-docsearch.json │ ├── openai-docsearch-strat.json │ ├── schema-config.json │ ├── schema-indexed_urls.json │ └── schema-simple.json ├── package.json ├── playground └── docusaurus │ ├── .gitignore │ ├── .stackblitzrc │ ├── README.md │ ├── babel.config.js │ ├── blog │ ├── 2019-05-28-first-blog-post.md │ ├── 2019-05-29-long-blog-post.md │ ├── 2021-08-01-mdx-blog-post.mdx │ ├── 2021-08-26-welcome │ │ ├── docusaurus-plushie-banner.jpeg │ │ └── index.md │ └── authors.yml │ ├── docs │ ├── intro.md │ ├── tutorial-basics │ │ ├── _category_.json │ │ ├── congratulations.md │ │ ├── create-a-blog-post.md │ │ ├── create-a-document.md │ │ ├── create-a-page.md │ │ ├── deploy-your-site.md │ │ └── markdown-features.mdx │ └── tutorial-extras │ │ ├── _category_.json │ │ ├── img │ │ ├── docsVersionDropdown.png │ │ └── localeDropdown.png │ │ ├── manage-docs-versions.md │ │ └── translate-your-site.md │ ├── docusaurus.config.js │ ├── package.json │ ├── sandbox.config.json │ ├── scrapix-config.json │ ├── sidebars.js │ ├── src │ ├── components │ │ └── HomepageFeatures │ │ │ ├── index.tsx │ │ │ └── styles.module.css │ ├── css │ │ └── custom.css │ ├── pages │ │ ├── DefaultSearchBar.tsx │ │ ├── index.module.css │ │ ├── index.tsx │ │ └── markdown-page.md │ └── theme │ │ └── SearchBar │ │ └── index.tsx │ ├── static │ ├── .nojekyll │ └── img │ │ ├── docusaurus-social-card.jpg │ │ ├── docusaurus.png │ │ ├── favicon.ico │ │ ├── logo.svg │ │ ├── undraw_docusaurus_mountain.svg │ │ ├── undraw_docusaurus_react.svg │ │ └── undraw_docusaurus_tree.svg │ ├── tsconfig.json │ └── yarn.lock ├── src ├── bin │ └── index.ts ├── crawler.ts ├── crawler_process.ts ├── index.ts ├── meilisearch_client.ts ├── package_version.ts ├── scrapers │ ├── default.ts │ ├── docssearch.ts │ └── schema.ts ├── sender.ts ├── server.ts ├── taskQueue.ts ├── types.ts └── webhook.ts ├── tsconfig.eslint.json ├── tsconfig.json └── yarn.lock /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | examples 4 | scripts 5 | tests/env 6 | coverage 7 | playground 8 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | browser: true, 4 | es6: true, 5 | es2020: true, 6 | 'jest/globals': true, 7 | node: true, 8 | jasmine: true, 9 | }, 10 | extends: [ 11 | 'eslint:recommended', 12 | 'plugin:@typescript-eslint/recommended', 13 | 'plugin:@typescript-eslint/recommended-requiring-type-checking', 14 | 'plugin:prettier/recommended', 15 | ], 16 | parser: '@typescript-eslint/parser', 17 | parserOptions: { 18 | ecmaVersion: 2019, 19 | project: ['tsconfig.eslint.json'], 20 | sourceType: 'module', 21 | projectFolderIgnoreList: ['dist'], 22 | }, 23 | plugins: ['@typescript-eslint', 'prettier', 'jest'], 24 | rules: { 25 | 'no-dupe-class-members': 'off', // Off due to conflict 
with typescript overload functions 26 | 'prettier/prettier': [ 27 | 'error', 28 | { 29 | singleQuote: true, 30 | arrowParens: 'always', 31 | semi: false, 32 | bracketSpacing: true, 33 | trailingComma: 'es5', 34 | tsdoc: true, 35 | printWidth: 80, 36 | }, 37 | ], 38 | '@typescript-eslint/array-type': ['warn', { default: 'array-simple' }], 39 | '@typescript-eslint/return-await': 'off', 40 | '@typescript-eslint/no-explicit-any': 'off', 41 | '@typescript-eslint/explicit-function-return-type': 'off', 42 | '@typescript-eslint/member-delimiter-style': [ 43 | 'error', 44 | { 45 | multiline: { 46 | delimiter: 'none', // 'none' or 'semi' or 'comma' 47 | requireLast: true, 48 | }, 49 | singleline: { 50 | delimiter: 'semi', // 'semi' or 'comma' 51 | requireLast: false, 52 | }, 53 | }, 54 | ], 55 | 'comma-dangle': 'off', 56 | '@typescript-eslint/ban-ts-ignore': 'off', 57 | '@typescript-eslint/no-misused-promises': ['off'], 58 | '@typescript-eslint/no-unsafe-member-access': ['off'], 59 | '@typescript-eslint/no-unsafe-argument': 'off', 60 | }, 61 | } 62 | -------------------------------------------------------------------------------- /.github/scripts/scrapix_server_call_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | async_url="http://localhost:8080/crawl/async" 4 | payload='{ 5 | "start_urls": [ 6 | "http://localhost:3000" 7 | ], 8 | "meilisearch_url": "http://localhost:7700", 9 | "meilisearch_api_key": "masterKey", 10 | "meilisearch_index_uid": "docusaurus-docsearch", 11 | "strategy": "docssearch" 12 | }' 13 | 14 | echo "Async crawling test" 15 | response=$(curl -X POST -H "Content-Type: application/json" -d "$payload" "$async_url") 16 | 17 | # Check if the response equals "Crawling started" 18 | if [ "$response" = "Crawling started" ]; then 19 | echo "Async crawling started successfully!" 20 | else 21 | echo "Async crawling failed or returned an unexpected response." 22 | echo "$response" 23 | exit 1 24 | fi 25 | 26 | sync_url="http://localhost:8080/crawl/sync" 27 | 28 | echo "Sync crawling test" 29 | response=$(curl -X POST -H "Content-Type: application/json" -d "$payload" "$sync_url") 30 | 31 | # Check if the response equals "Crawling finished" 32 | if [ "$response" = "Crawling finished" ]; then 33 | echo "Sync crawling finished successfully!" 34 | else 35 | echo "Sync crawling failed or returned an unexpected response."
36 | echo "$response" 37 | exit 1 38 | fi 39 | 40 | exit 0 41 | -------------------------------------------------------------------------------- /.github/workflows/publish-docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Publish image to DockerHub 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Docker Buildx 14 | uses: docker/setup-buildx-action@v2 15 | 16 | - name: Set up QEMU 17 | uses: docker/setup-qemu-action@v2 18 | 19 | - name: Login to Docker Hub 20 | uses: docker/login-action@v2 21 | with: 22 | username: ${{ secrets.DOCKER_USERNAME }} 23 | password: ${{ secrets.DOCKER_PASSWORD }} 24 | 25 | - name: Build and push 26 | uses: docker/build-push-action@v4 27 | with: 28 | push: true 29 | platforms: linux/amd64 30 | tags: getmeili/scrapix:latest,getmeili/scrapix:${{ github.ref_name }} 31 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - trying 8 | - staging 9 | - main 10 | 11 | jobs: 12 | lint_tests: 13 | runs-on: ubuntu-latest 14 | name: lint 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Setup node 18 | uses: actions/setup-node@v4 19 | with: 20 | node-version: 20 21 | cache: "yarn" 22 | cache-dependency-path: yarn.lock 23 | - name: Restore cache 24 | uses: actions/cache@v3 25 | with: 26 | path: node_modules 27 | key: yarn-${{ runner.os }}-${{ hashFiles('yarn.lock') }} 28 | restore-keys: | 29 | yarn-${{ runner.os }}- 30 | - name: Install dependencies 31 | run: yarn --frozen-lockfile 32 | - name: Run JS/TS linter 33 | run: yarn lint 34 | 35 | build_test: 36 | runs-on: ubuntu-latest 37 | name: build 38 | steps: 39 | - uses: actions/checkout@v3 40 | - name: Setup node 41 | uses: actions/setup-node@v4 42 | with: 43 | node-version: 20 44 | cache: "yarn" 45 | cache-dependency-path: yarn.lock 46 | - name: Restore cache 47 | uses: actions/cache@v3 48 | with: 49 | path: node_modules 50 | key: yarn-${{ runner.os }}-${{ hashFiles('yarn.lock') }} 51 | restore-keys: | 52 | yarn-${{ runner.os }}- 53 | - name: Install dependencies 54 | run: yarn --frozen-lockfile 55 | - name: Cache build artifacts 56 | uses: actions/cache@v3 57 | with: 58 | path: dist # Or another build output directory 59 | key: build-${{ runner.os }}-${{ github.sha }} 60 | restore-keys: | 61 | build-${{ runner.os }}- 62 | - name: Build project 63 | run: yarn build 64 | 65 | scrap_tests: 66 | runs-on: ubuntu-latest 67 | name: Crawler tests 68 | services: 69 | meilisearch: 70 | image: getmeili/meilisearch:latest 71 | env: 72 | MEILI_MASTER_KEY: "masterKey" 73 | MEILI_NO_ANALYTICS: "true" 74 | ports: 75 | - "7700:7700" 76 | strategy: 77 | fail-fast: false 78 | matrix: 79 | node: ["20"] 80 | steps: 81 | - uses: actions/checkout@v3 82 | - name: Setup node 83 | uses: actions/setup-node@v4 84 | with: 85 | node-version: ${{ matrix.node }} 86 | cache: "yarn" 87 | cache-dependency-path: yarn.lock 88 | - name: Restore cache 89 | uses: actions/cache@v3 90 | with: 91 | path: node_modules 92 | key: yarn-${{ runner.os }}-${{ hashFiles('yarn.lock') }} 93 | restore-keys: | 94 | yarn-${{ runner.os }}- 95 | - name: Install dependencies 96 | run: yarn --frozen-lockfile 97 | - name: Restore build cache 98 | uses: actions/cache@v3 99 | with: 100 | path: dist 101 |
key: build-${{ runner.os }}-${{ github.sha }} 102 | restore-keys: | 103 | build-${{ runner.os }}- 104 | - name: Run playground 105 | run: yarn playground:start & 106 | - name: Run default strategy scraper 107 | run: yarn start -p misc/config_examples/docusaurus-default.json 108 | - name: Run docsearch strategy scraper 109 | run: yarn start -p misc/config_examples/docusaurus-docsearch.json 110 | - name: Run scrapix server 111 | run: yarn serve & 112 | - name: Wait 113 | run: sleep 5 114 | - name: Call scrapix server 115 | run: sh .github/scripts/scrapix_server_call_check.sh 116 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | # misc 107 | .DS_Store 108 | 109 | # parcel 110 | .parcel_cache/ 111 | 112 | ############################ 113 | # CYPRESS 114 | ############################ 115 | cypress/screenshots 116 | cypress/videos 117 | cypress/support 118 | cypress/plugins 119 | cypress/fixtures 120 | 121 | ############################ 122 | # MISC 123 | ############################ 124 | 125 | .DS_Store 126 | dist 127 | package 128 | .vscode 129 | .idea 130 | dist_default_export_in_index 131 | no_default_export_in_index 132 | storage 133 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | # Run linting tests 2 | 3 | ```sh 4 | yarn lint # to test 5 | yarn lint:fix # to fix errors 6 | ``` 7 | 8 | # Running in dev mode 9 | 10 | ```sh 11 | # Run the build in watch mode 12 | yarn dev:build 13 | ``` 14 | 15 | Running this mode rebuilds scrapix on every change made in the source files (`./src`). 16 | 17 | # Running the playground 18 | 19 | ```sh 20 | yarn playground:docsearch 21 | yarn playground:default 22 | ``` 23 | 24 | Running this mode has two effects. 25 | 26 | - If you change the source code of the docusaurus playground, the docusaurus app restarts. 27 | - If you change the source code of scrapix, scrapix is rebuilt and re-runs a scraper (either the default one or the docsearch one) on the docusaurus app. 28 | 29 | # Re-scrape a chosen app on change 30 | 31 | If you wish to re-scrape an app based on a custom scrapix configuration file, run the following: 32 | 33 | ```sh 34 | npx nodemon --watch src --watch "[PATH_TO_YOUR_CONFIG_FILE]" --ext ts,json --exec "yarn start -c [PATH_TO_YOUR_CONFIG_FILE]" 35 | ``` 36 | 37 | This will re-scrape the app based on the provided "PATH_TO_YOUR_CONFIG_FILE" on every change in the `./src` folder and in your provided config file. 38 | 39 | ## Publish 40 | 41 | To publish scrapix to npm, bump its version and then run: 42 | 43 | ```sh 44 | yarn build 45 | npm publish . 46 | ``` 47 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-puppeteer-chrome:20-21.4.0 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies, including the development ones needed for the build. 11 | RUN yarn install --production=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Build the project. Dependencies were already 18 | # installed in the previous step. 19 | RUN yarn run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-puppeteer-chrome:20-21.4.0 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY --chown=myuser package*.json ./ 30 | 31 | # Install the dependencies for the final image. Note that `--production=false` 32 | # also installs development dependencies; drop it if you want 33 | # to keep the image small. 34 | RUN yarn install --production=false 35 | 36 | # Next, copy the remaining files and directories with the source code. 37 | # Since we do this after the install step, rebuilds will be really fast 38 | # for most source file changes. 39 | COPY --chown=myuser . ./ 40 | 41 | # Run the image. If you know you won't need headful browsers, 42 | # you can remove the XVFB start script for a micro perf gain.
43 | CMD ./start_xvfb_and_run_cmd.sh && yarn start:prod -- -c $CRAWLER_CONFIG -b /usr/bin/google-chrome --silent 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2025 Meili SAS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapix 2 | 3 | This project is an API that will allow you to scrape any website and send the data to Meilisearch. 4 | 5 | This server has only one endpoint. 6 | 7 | ## Bin usage 8 | 9 | Scrapix provides a CLI to start the crawling process. 10 | 11 | ``` 12 | Usage: yarn start [options] 13 | 14 | Options: 15 | -c, --config JSON string with the scrapix configuration 16 | -p, --config-path Path to the scrapix configuration JSON file 17 | -b, --browser-path Path to the browser binary 18 | ``` 19 | 20 | ## Endpoint 21 | 22 | ### POST /crawl 23 | 24 | This endpoint will crawl the website and send the data to Meilisearch. 25 | Request body: 26 | 27 | ```json 28 | { 29 | "start_urls": ["https://www.google.com"], 30 | "urls_to_exclude": ["https://www.google.com"], 31 | "urls_to_index": ["https://www.google.com"], 32 | "urls_to_not_index": ["https://www.google.com"], 33 | "meilisearch_url": "http://localhost:7700", 34 | "meilisearch_api_key": "masterKey", 35 | "meilisearch_index_uid": "google", 36 | "strategy": "default", // docssearch, schema*, custom or default 37 | "headless": true, // Use a headless browser to render JavaScript websites 38 | "batch_size": 1000, // pass null to send documents 1 at a time or specify a batch size 39 | "primary_key": null, 40 | "meilisearch_settings": { 41 | "searchableAttributes": [ 42 | "h1", 43 | "h2", 44 | "h3", 45 | "h4", 46 | "h5", 47 | "h6", 48 | "p", 49 | "title", 50 | "meta.description" 51 | ], 52 | "filterableAttributes": ["urls_tags"], 53 | "distinctAttribute": "url" 54 | }, 55 | "schema_settings": { 56 | "only_type": "Product", // Product, Article, etc... 57 | "convert_dates": true // default false 58 | } 59 | } 60 | ``` 61 | 62 | ## Process 63 | 64 | ### 1. Add it to the queue 65 | 66 | When the server receives a crawling request, it adds it to the queue. Once the job is added to the queue, the server returns a response to the user.
67 | The queue is handled by Redis ([Bull](https://github.com/OptimalBits/bull)). 68 | The queue will dispatch the job to the worker. 69 | 70 | ### 2. Scrape the website 71 | 72 | #### 2.1. Default strategy 73 | 74 | The worker will crawl only pages with the same domain names as those specified in the `start_urls` config option. It will not try to scrape external links or files. It will also not try to scrape paginated pages (like `/page/1`). 75 | For each scrapable page it will scrape the data by trying to create blocks of titles and text. Each block will contain: 76 | 77 | - h1: The title of the block 78 | - h2: The subtitle of the block 79 | - h3...h6: The sub-subtitles of the block 80 | - p: The text of the block (an array of texts if there are multiple p elements in the block) 81 | - page_block: The block number of the page (starting at 0) 82 | - title: The title of the page present in the head tag 83 | - uid: a generated and incremental uid for the block 84 | - url: The url of the page 85 | - anchor: The anchor of the block (the lowest title id of the block) 86 | - meta: The meta of the page present in the head tag (json object containing the description, keywords, author, twitter, og, etc...) 87 | - url_tags: the url pathname split by / (array of strings). The last element has been removed because it's the page name. 88 | 89 | #### 2.2. Docsearch strategy 90 | 91 | The worker will crawl only pages with the same domain names as those specified in the `start_urls` config option. It will not try to scrape external links or files. It will also not try to scrape paginated pages (like `/page/1`). 92 | For each scrapable page it will scrape the data by trying to create blocks of titles and text. Each block will contain: 93 | 94 | - uid: a generated and incremental uid for the block 95 | - hierarchy_lvl0: the url pathname split by / (array of strings). The last element has been removed because it's the page name. 96 | - hierarchy_lvl1: the h1 of the block 97 | - hierarchy_lvl2: the h2 of the block 98 | - hierarchy_lvl3: the h3 of the block 99 | - hierarchy_lvl4: the h4 of the block 100 | - hierarchy_lvl5: the h5 of the block 101 | - hierarchy_radio_lvl0: same as hierarchy_lvl0 102 | - hierarchy_radio_lvl1: same as hierarchy_lvl1 103 | - hierarchy_radio_lvl2: same as hierarchy_lvl2 104 | - hierarchy_radio_lvl3: same as hierarchy_lvl3 105 | - hierarchy_radio_lvl4: same as hierarchy_lvl4 106 | - hierarchy_radio_lvl5: same as hierarchy_lvl5 107 | - content: The text of the block (an array of texts if there are multiple p elements in the block) 108 | - url: The url of the page with the anchor 109 | - anchor: The anchor of the block (the lowest title id of the block) 110 | 111 | ### 3. Send the data to Meilisearch 112 | 113 | While the worker scrapes the website, it sends the data to Meilisearch in batches. 114 | Before sending the data to Meilisearch, it creates a new index called `{index_uid}_crawler_tmp`, applies the settings and adds the data to it. Then it uses the index swap method to replace the old index with the new one. Finally, it deletes the tmp index.
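As an illustration, the tmp-index and swap flow described above can be sketched with the Meilisearch JS client roughly as follows. This is a simplified sketch, not the actual code from `src/sender.ts`: the `publishToMeilisearch` helper and its parameters are hypothetical, and the host and API key are the local development defaults used elsewhere in this README.

```ts
import { MeiliSearch, Settings } from 'meilisearch'

// Hypothetical helper illustrating the flow; not the actual implementation.
async function publishToMeilisearch(
  indexUid: string,
  documents: Array<Record<string, unknown>>,
  settings: Settings
) {
  const client = new MeiliSearch({
    host: 'http://localhost:7700',
    apiKey: 'masterKey',
  })
  const tmpUid = `${indexUid}_crawler_tmp`

  // Build the temporary index: apply the settings, then add the scraped documents.
  // (In real code you would wait for each enqueued task to finish before moving on.)
  await client.index(tmpUid).updateSettings(settings)
  await client.index(tmpUid).addDocuments(documents)

  // Swap the temporary index with the live one, then delete the temporary index.
  await client.swapIndexes([{ indexes: [tmpUid, indexUid] }])
  await client.deleteIndex(tmpUid)
}
```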
115 | 116 | The settings applied: 117 | 118 | ```json 119 | { 120 | "searchableAttributes": [ 121 | "h1", 122 | "h2", 123 | "h3", 124 | "h4", 125 | "h5", 126 | "h6", 127 | "p", 128 | "title", 129 | "meta.description" 130 | ], 131 | "filterableAttributes": ["urls_tags"], 132 | "distinctAttribute": "url" 133 | } 134 | ``` 135 | 136 | ## Configuration file 137 | 138 | `start_urls` _mandatory_ 139 | 140 | This array contains the list of URLs that will be used to start scraping your website. 141 | The scraper will recursively follow any links (`<a>` tags) from those pages. It will not follow links that are on another domain. 142 | 143 | `urls_to_exclude` 144 | List of the URLs to ignore 145 | 146 | `urls_to_not_index` 147 | List of the URLs that should not be indexed 148 | 149 | `meilisearch_url` _mandatory_ 150 | The URL of your Meilisearch instance 151 | 152 | `meilisearch_api_key` 153 | The API key of your Meilisearch instance. This key must have read and write permissions for the specified index. 154 | 155 | `meilisearch_index_uid` _mandatory_ 156 | Name of the index on which the content is indexed. 157 | 158 | `strategy` 159 | default: `default` 160 | Scraping strategy: - `default`: scrapes the content of webpages; suitable for most use cases. It indexes the content in the block format shown above. - `docssearch`: scrapes the content of webpages like the default strategy, but indexes the content in a format compatible with docs-searchbar. - `schema`: scrapes the [`schema`](https://schema.org/) information of your web app. 161 | 162 | `headless` 163 | default: `true` 164 | Whether or not the JavaScript should be loaded before scraping starts. 165 | 166 | `primary_key` 167 | The key name in your documents containing their unique identifier. 168 | 169 | `meilisearch_settings` 170 | Your custom Meilisearch settings 171 | 172 | `schema_settings` 173 | If your strategy is `schema`: 174 | `only_type`: Which types of schema should be parsed 175 | `convert_dates`: Whether dates should be converted to timestamps. This is useful to be able to order by date. 176 | 177 | `user_agents` 178 | An array of user agents that are appended to the end of the current user agents. 179 | For example, if your `user_agents` value is `['My Thing (vx.x.x)']`, the final `user_agent` becomes 180 | 181 | ``` 182 | Meilisearch JS (vx.x.x); Meilisearch Crawler (vx.x.x); My Thing (vx.x.x) 183 | ``` 184 | 185 | `webhook_payload` 186 | When [webhooks](#webhooks) are enabled, the `webhook_payload` option lets you provide additional information that will be added to the webhook payload. 187 | 188 | `webhook_url` 189 | The URL on which the webhook calls are made. 190 | 191 | `additional_request_headers` 192 | An object containing headers to be added to every request the crawler makes. 193 | This can be useful to add authentication headers to crawl protected sites. 194 | 195 | E.g. authenticate the crawler with basic auth: 196 | ``` 197 | { 198 | "additional_request_headers": { 199 | "Authorization": "Basic dXNlcjpwYXNzd29yZA==" 200 | } 201 | } 202 | ``` 203 | 204 | ## Webhooks 205 | 206 | To be able to receive updates on the state of the crawler, you need to create a webhook. To do so, you must have a public URL that is reachable by the crawler. This URL will be called by the crawler to send you updates. 207 | 208 | To enable webhooks, you need to add the following env vars.
209 | 210 | ```txt 211 | WEBHOOK_URL=https://mywebsite.com/webhook 212 | WEBHOOK_TOKEN=mytoken 213 | WEBHOOK_INTERVAL=1000 214 | ``` 215 | 216 | - The `WEBHOOK_URL` is the URL that will be called by the crawler. The calls will be made with the `POST` method. 217 | - The `WEBHOOK_TOKEN` is a token string used to authenticate the request. If present, it is sent in the `Authorization` header of the request in the format `Authorization: Bearer ${token}`. 218 | - The `WEBHOOK_INTERVAL` is a way to change how frequently you want to receive updates from the scraper. The value is in milliseconds. The default value is 5000ms. 219 | 220 | Here is the webhook payload: 221 | 222 | ```json 223 | { 224 | "date": "2022-01-01T12:34:56.000Z", 225 | "meilisearch_url": "https://myproject.meilisearch.com", 226 | "meilisearch_index_uid": "myindex", 227 | "status": "active", // "added", "completed", "failed", "active", "wait", "delayed" 228 | "nb_page_crawled": 20, 229 | "nb_page_indexed": 15 230 | } 231 | ``` 232 | 233 | It is possible to add additional information to the webhook payload through the `webhook_payload` configuration option. 234 | 235 | ## Docker 236 | 237 | ### Usage 238 | 239 | ``` 240 | docker run --rm --env-file .env getmeili/scrapix 241 | docker run --rm --env CRAWLER_CONFIG=$CRAWLER_CONFIG getmeili/scrapix 242 | ``` 243 | 244 | ⚠️ Avoid any whitespace inside your CRAWLER_CONFIG. 245 | 246 | ## Publish 247 | 248 | The CI handles the publishing. 249 | -------------------------------------------------------------------------------- /config/nodemon:build.json: -------------------------------------------------------------------------------- 1 | { 2 | "watch": ["src"], 3 | "ext": "ts,json", 4 | "exec": "yarn build" 5 | } 6 | -------------------------------------------------------------------------------- /config/nodemon:default-scrap.json: -------------------------------------------------------------------------------- 1 | { 2 | "watch": ["src", "misc/config_examples/docusaurus-default.json"], 3 | "ext": "ts,json", 4 | "exec": "yarn start -p misc/config_examples/docusaurus-default.json" 5 | } 6 | -------------------------------------------------------------------------------- /config/nodemon:docsearch-scrap.json: -------------------------------------------------------------------------------- 1 | { 2 | "watch": ["src", "misc/config_examples/docusaurus-docsearch.json"], 3 | "ext": "ts,json", 4 | "exec": "yarn start -p misc/config_examples/docusaurus-docsearch.json" 5 | } 6 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | # Docker Compose file that runs the latest versions of Meilisearch and Scrapix.
2 | version: '3' 3 | 4 | services: 5 | scrapix: 6 | image: scrapix:latest 7 | restart: always 8 | ports: 9 | - 8080:8080 10 | environment: 11 | - REDIS_URL=redis://redis:6379 12 | 13 | meilisearch: 14 | image: getmeili/meilisearch:latest 15 | restart: always 16 | ports: 17 | - 7700:7700 18 | environment: 19 | - MEILI_NO_ANALYTICS=true 20 | - MEILI_MASTER_KEY=masterKey 21 | - MEILI_ENV=development 22 | 23 | redis: 24 | image: redis:latest 25 | restart: always 26 | ports: 27 | - 6379:6379 28 | -------------------------------------------------------------------------------- /misc/Scrapix.postman_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "_postman_id": "00db6d82-0d03-4800-a721-219a02dd18b3", 4 | "name": "Scrapix", 5 | "description": "Scrapix is an API that will allow you to scrap any website and send the data to Meilisearch.", 6 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", 7 | "_exporter_id": "28269" 8 | }, 9 | "item": [ 10 | { 11 | "name": "Crawl - Default - Simple", 12 | "request": { 13 | "method": "POST", 14 | "header": [], 15 | "body": { 16 | "mode": "raw", 17 | "raw": "{\n \"urls\": [\n \"https://platform.openai.com/docs\"\n ],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"openai\"\n}", 18 | "options": { 19 | "raw": { 20 | "language": "json" 21 | } 22 | } 23 | }, 24 | "url": { 25 | "raw": "{{scrapix_url}}/crawl", 26 | "host": [ 27 | "{{scrapix_url}}" 28 | ], 29 | "path": [ 30 | "crawl" 31 | ] 32 | } 33 | }, 34 | "response": [] 35 | }, 36 | { 37 | "name": "Crawl - Default - Excluded URLs", 38 | "request": { 39 | "method": "POST", 40 | "header": [], 41 | "body": { 42 | "mode": "raw", 43 | "raw": "{\n \"urls\": [\n \"https://meilisearch.com/docs\",\n \"https://www.meilisearch.com/docs\"\n ],\n \"urls_to_not_index\": [\"https://blog.meilisearch.com/page\"],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\"\n}", 44 | "options": { 45 | "raw": { 46 | "language": "json" 47 | } 48 | } 49 | }, 50 | "url": { 51 | "raw": "{{scrapix_url}}/crawl", 52 | "host": [ 53 | "{{scrapix_url}}" 54 | ], 55 | "path": [ 56 | "crawl" 57 | ] 58 | } 59 | }, 60 | "response": [] 61 | }, 62 | { 63 | "name": "Crawl - Default - Bigger batches", 64 | "request": { 65 | "method": "POST", 66 | "header": [], 67 | "body": { 68 | "mode": "raw", 69 | "raw": "{\n \"urls\": [\n \"https://meilisearch.com/docs\",\n \"https://www.meilisearch.com/docs\"\n ],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\",\n \"batch_size\": 1000\n}", 70 | "options": { 71 | "raw": { 72 | "language": "json" 73 | } 74 | } 75 | }, 76 | "url": { 77 | "raw": "{{scrapix_url}}/crawl", 78 | "host": [ 79 | "{{scrapix_url}}" 80 | ], 81 | "path": [ 82 | "crawl" 83 | ] 84 | } 85 | }, 86 | "response": [] 87 | }, 88 | { 89 | "name": "Crawl - Docsearch - Simple", 90 | "request": { 91 | "method": "POST", 92 | "header": [], 93 | "body": { 94 | "mode": "raw", 95 | "raw": "{\n \"urls\": [\n \"https://meilisearch.com/docs\",\n \"https://www.meilisearch.com/docs\"\n ],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\",\n \"strategy\": 
\"docssearch\"\n}", 96 | "options": { 97 | "raw": { 98 | "language": "json" 99 | } 100 | } 101 | }, 102 | "url": { 103 | "raw": "{{scrapix_url}}/crawl", 104 | "host": [ 105 | "{{scrapix_url}}" 106 | ], 107 | "path": [ 108 | "crawl" 109 | ] 110 | } 111 | }, 112 | "response": [] 113 | }, 114 | { 115 | "name": "Crawl - Schema - Simple", 116 | "request": { 117 | "method": "POST", 118 | "header": [], 119 | "body": { 120 | "mode": "raw", 121 | "raw": "{\n \"urls\": [\n \"https://blog.meilisearch.com/\"\n ],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\",\n \"strategy\": \"schema\"\n}", 122 | "options": { 123 | "raw": { 124 | "language": "json" 125 | } 126 | } 127 | }, 128 | "url": { 129 | "raw": "{{scrapix_url}}/crawl", 130 | "host": [ 131 | "{{scrapix_url}}" 132 | ], 133 | "path": [ 134 | "crawl" 135 | ] 136 | } 137 | }, 138 | "response": [] 139 | }, 140 | { 141 | "name": "Crawl - Schema - Config", 142 | "request": { 143 | "method": "POST", 144 | "header": [], 145 | "body": { 146 | "mode": "raw", 147 | "raw": "{\n \"urls\": [\n \"https://blog.meilisearch.com/\"\n ],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\",\n \"strategy\": \"schema\",\n \"schema_settings\": {\n \"only_type\": \"Article\",\n \"convert_dates\": true\n }\n}", 148 | "options": { 149 | "raw": { 150 | "language": "json" 151 | } 152 | } 153 | }, 154 | "url": { 155 | "raw": "{{scrapix_url}}/crawl", 156 | "host": [ 157 | "{{scrapix_url}}" 158 | ], 159 | "path": [ 160 | "crawl" 161 | ] 162 | } 163 | }, 164 | "response": [] 165 | }, 166 | { 167 | "name": "Crawl - Schema - Index Only Urls", 168 | "request": { 169 | "method": "POST", 170 | "header": [], 171 | "body": { 172 | "mode": "raw", 173 | "raw": "{\n \"urls\": [\n \"https://eu.patagonia.com/fr/fr/shop\",\n \"https://eu.patagonia.com/fr/fr/product\"\n ],\n \"urls_to_index\": [\"https://eu.patagonia.com/fr/fr/product\"],\n \"meilisearch_url\": \"{{meilisearch_url}}\",\n \"meilisearch_api_key\": \"{{meilisearch_api_key}}\",\n \"meilisearch_index_uid\": \"{{meilisearch_index_uid}}\",\n \"strategy\": \"schema\",\n \"primary_key\": \"sku\"\n}", 174 | "options": { 175 | "raw": { 176 | "language": "json" 177 | } 178 | } 179 | }, 180 | "url": { 181 | "raw": "{{scrapix_url}}/crawl", 182 | "host": [ 183 | "{{scrapix_url}}" 184 | ], 185 | "path": [ 186 | "crawl" 187 | ] 188 | } 189 | }, 190 | "response": [] 191 | } 192 | ], 193 | "event": [ 194 | { 195 | "listen": "prerequest", 196 | "script": { 197 | "type": "text/javascript", 198 | "exec": [ 199 | "" 200 | ] 201 | } 202 | }, 203 | { 204 | "listen": "test", 205 | "script": { 206 | "type": "text/javascript", 207 | "exec": [ 208 | "" 209 | ] 210 | } 211 | } 212 | ], 213 | "variable": [ 214 | { 215 | "key": "scrapix_url", 216 | "value": "http://localhost:3000", 217 | "type": "string" 218 | }, 219 | { 220 | "key": "meilisearch_url", 221 | "value": "http://localhost:7700", 222 | "type": "string" 223 | }, 224 | { 225 | "key": "meilisearch_api_key", 226 | "value": "masterKey", 227 | "type": "string" 228 | }, 229 | { 230 | "key": "meilisearch_index_uid", 231 | "value": "main", 232 | "type": "string" 233 | } 234 | ] 235 | } 236 | -------------------------------------------------------------------------------- /misc/config_examples/default-bigger_batches.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "https://meilisearch.com/docs", 4 | "https://www.meilisearch.com/docs" 5 | ], 6 | "meilisearch_url": "{{meilisearch_url}}", 7 | "meilisearch_api_key": "{{meilisearch_api_key}}", 8 | "meilisearch_index_uid": "{{meilisearch_index_uid}}", 9 | "batch_size": 1000 10 | } 11 | -------------------------------------------------------------------------------- /misc/config_examples/default-exclude_urls.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "https://meilisearch.com/docs", 4 | "https://www.meilisearch.com/docs" 5 | ], 6 | "urls_to_not_index": ["https://blog.meilisearch.com/page"], 7 | "meilisearch_url": "{{meilisearch_url}}", 8 | "meilisearch_api_key": "{{meilisearch_api_key}}", 9 | "meilisearch_index_uid": "{{meilisearch_index_uid}}" 10 | } 11 | -------------------------------------------------------------------------------- /misc/config_examples/default-simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": ["https://platform.openai.com/docs"], 3 | "meilisearch_url": "localhost:7700", 4 | "meilisearch_api_key": "masterKey", 5 | "meilisearch_index_uid": "openai" 6 | } 7 | -------------------------------------------------------------------------------- /misc/config_examples/docsearch-simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "https://meilisearch.com/docs", 4 | "https://www.meilisearch.com/docs" 5 | ], 6 | "meilisearch_url": "{{meilisearch_url}}", 7 | "meilisearch_api_key": "{{meilisearch_api_key}}", 8 | "meilisearch_index_uid": "{{meilisearch_index_uid}}", 9 | "strategy": "docssearch" 10 | } 11 | -------------------------------------------------------------------------------- /misc/config_examples/docusaurus-default.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "http://localhost:3000" 4 | ], 5 | "meilisearch_url": "http://localhost:7700", 6 | "meilisearch_api_key": "masterKey", 7 | "meilisearch_index_uid": "docusaurus-default", 8 | "strategy": "default" 9 | } 10 | -------------------------------------------------------------------------------- /misc/config_examples/docusaurus-docsearch.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "http://localhost:3000" 4 | ], 5 | "meilisearch_url": "http://localhost:7700", 6 | "meilisearch_api_key": "masterKey", 7 | "meilisearch_index_uid": "docusaurus-docsearch", 8 | "strategy": "docssearch" 9 | } 10 | -------------------------------------------------------------------------------- /misc/config_examples/openai-docsearch-strat.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": ["https://platform.openai.com/docs"], 3 | "meilisearch_url": "localhost:7700", 4 | "meilisearch_api_key": "masterKey", 5 | "meilisearch_index_uid": "openai", 6 | "strategy": "docssearch", 7 | "meilisearch_settings": { 8 | "distinctAttribute": "content" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /misc/config_examples/schema-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": ["https://blog.meilisearch.com/"], 3 | "meilisearch_url": 
"{{meilisearch_url}}", 4 | "meilisearch_api_key": "{{meilisearch_api_key}}", 5 | "meilisearch_index_uid": "{{meilisearch_index_uid}}", 6 | "strategy": "schema", 7 | "schema_settings": { 8 | "only_type": "Article", 9 | "convert_dates": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /misc/config_examples/schema-indexed_urls.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "https://eu.patagonia.com/fr/fr/shop", 4 | "https://eu.patagonia.com/fr/fr/product" 5 | ], 6 | "urls_to_index": ["https://eu.patagonia.com/fr/fr/product"], 7 | "meilisearch_url": "{{meilisearch_url}}", 8 | "meilisearch_api_key": "{{meilisearch_api_key}}", 9 | "meilisearch_index_uid": "{{meilisearch_index_uid}}", 10 | "strategy": "schema", 11 | "primary_key": "sku" 12 | } 13 | -------------------------------------------------------------------------------- /misc/config_examples/schema-simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": ["https://blog.meilisearch.com/"], 3 | "meilisearch_url": "{{meilisearch_url}}", 4 | "meilisearch_api_key": "{{meilisearch_api_key}}", 5 | "meilisearch_index_uid": "{{meilisearch_index_uid}}", 6 | "strategy": "schema" 7 | } 8 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@meilisearch/scrapix", 3 | "version": "0.1.9", 4 | "description": "Automatic scraper and indexer to Meilisearch of any website.", 5 | "main": "dist/src/index.js", 6 | "dependencies": { 7 | "axios": "^1.4.0", 8 | "bull": "^4.10.4", 9 | "crawlee": "^3.0.0", 10 | "dotenv": "^16.0.3", 11 | "express": "^4.18.2", 12 | "instantsearch.css": "^8.0.0", 13 | "meilisearch": "^0.31.1", 14 | "minimatch": "^9.0.1", 15 | "prettier": "^2.8.4", 16 | "puppeteer": "^21.0.1", 17 | "puppeteer-core": "^21.0.1", 18 | "uuid": "^9.0.0", 19 | "yargs": "^17.7.2" 20 | }, 21 | "scripts": { 22 | "build": "yarn tsc", 23 | "start": "yarn tsc && node dist/src/bin/index.js", 24 | "start:prod": "node dist/src/bin/index.js", 25 | "serve": "yarn tsc && node dist/src/server.js", 26 | "dev:build": "nodemon --config ./config/nodemon:build.json", 27 | "dev:ds:scrap": "nodemon --config ./config/nodemon:docsearch-scrap.json", 28 | "dev:default:scrap": "nodemon --config ./config/nodemon:default-scrap.json", 29 | "playground:docsearch": "concurrently \"yarn dev:ds:scrap\" \"yarn playground:start\"", 30 | "playground:default": "concurrently \"yarn dev:default:scrap\" \"yarn playground:start\"", 31 | "lint": "eslint .", 32 | "lint:fix": "eslint . 
--fix", 33 | "playground:start": "yarn --cwd playground/docusaurus && yarn --cwd playground/docusaurus start", 34 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 35 | }, 36 | "author": "It's not you it's me", 37 | "license": "ISC", 38 | "devDependencies": { 39 | "@apify/log": "^2.1.3", 40 | "@apify/tsconfig": "^0.1.0", 41 | "@types/express": "^4.17.17", 42 | "@types/prettier": "^2.7.3", 43 | "@types/uuid": "^9.0.2", 44 | "@types/yargs": "^17.0.24", 45 | "@typescript-eslint/eslint-plugin": "^5.60.0", 46 | "@typescript-eslint/parser": "^5.60.0", 47 | "concurrently": "^8.2.0", 48 | "eslint": "^8.43.0", 49 | "eslint-config-prettier": "^8.8.0", 50 | "eslint-plugin-jest": "^27.2.2", 51 | "eslint-plugin-prettier": "^4.2.1", 52 | "nodemon": "^2.0.22", 53 | "typescript": "^5.1.3" 54 | }, 55 | "files": [ 56 | "dist", 57 | "src" 58 | ] 59 | } 60 | -------------------------------------------------------------------------------- /playground/docusaurus/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | -------------------------------------------------------------------------------- /playground/docusaurus/.stackblitzrc: -------------------------------------------------------------------------------- 1 | { 2 | "installDependencies": true, 3 | "startCommand": "npm start" 4 | } 5 | -------------------------------------------------------------------------------- /playground/docusaurus/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. 4 | 5 | ### Installation 6 | 7 | ``` 8 | $ yarn 9 | ``` 10 | 11 | ### Local Development 12 | 13 | ``` 14 | $ yarn start 15 | ``` 16 | 17 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. 18 | 19 | ### Build 20 | 21 | ``` 22 | $ yarn build 23 | ``` 24 | 25 | This command generates static content into the `build` directory and can be served using any static contents hosting service. 26 | 27 | ### Deployment 28 | 29 | Using SSH: 30 | 31 | ``` 32 | $ USE_SSH=true yarn deploy 33 | ``` 34 | 35 | Not using SSH: 36 | 37 | ``` 38 | $ GIT_USER= yarn deploy 39 | ``` 40 | 41 | If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. 42 | -------------------------------------------------------------------------------- /playground/docusaurus/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /playground/docusaurus/blog/2019-05-28-first-blog-post.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: first-blog-post 3 | title: First Blog Post 4 | authors: [gaowei] 5 | tags: [hola, docusaurus] 6 | --- 7 | 8 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 9 | -------------------------------------------------------------------------------- /playground/docusaurus/blog/2019-05-29-long-blog-post.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: long-blog-post 3 | title: Long Blog Post 4 | authors: endi 5 | tags: [hello, docusaurus] 6 | --- 7 | 8 | This is the summary of a very long blog post, 9 | 10 | Use a `` comment to limit blog post size in the list view. 11 | 12 | 13 | 14 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 15 | 16 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 17 | 18 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 19 | 20 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 21 | 22 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 23 | 24 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 25 | 26 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 27 | 28 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 29 | 30 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 31 | 32 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 33 | 34 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 35 | 36 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 37 | 38 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 39 | 40 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 41 | 42 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 43 | 44 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque elementum dignissim ultricies. 
Fusce rhoncus ipsum tempor eros aliquam consequat. Lorem ipsum dolor sit amet 45 | -------------------------------------------------------------------------------- /playground/docusaurus/blog/2021-08-01-mdx-blog-post.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | slug: mdx-blog-post 3 | title: MDX Blog Post 4 | authors: [slorber] 5 | tags: [docusaurus] 6 | --- 7 | 8 | Blog posts support [Docusaurus Markdown features](https://docusaurus.io/docs/markdown-features), such as [MDX](https://mdxjs.com/). 9 | 10 | :::tip 11 | 12 | Use the power of React to create interactive blog posts. 13 | 14 | ```js 15 | 16 | ``` 17 | 18 | 19 | 20 | ::: 21 | -------------------------------------------------------------------------------- /playground/docusaurus/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg -------------------------------------------------------------------------------- /playground/docusaurus/blog/2021-08-26-welcome/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: welcome 3 | title: Welcome 4 | authors: [slorber, yangshun] 5 | tags: [facebook, hello, docusaurus] 6 | --- 7 | 8 | [Docusaurus blogging features](https://docusaurus.io/docs/blog) are powered by the [blog plugin](https://docusaurus.io/docs/api/plugins/@docusaurus/plugin-content-blog). 9 | 10 | Simply add Markdown files (or folders) to the `blog` directory. 11 | 12 | Regular blog authors can be added to `authors.yml`. 13 | 14 | The blog post date can be extracted from filenames, such as: 15 | 16 | - `2019-05-30-welcome.md` 17 | - `2019-05-30-welcome/index.md` 18 | 19 | A blog post folder can be convenient to co-locate blog post images: 20 | 21 | ![Docusaurus Plushie](./docusaurus-plushie-banner.jpeg) 22 | 23 | The blog supports tags as well! 24 | 25 | **And if you don't want a blog**: just delete this directory, and use `blog: false` in your Docusaurus config. 26 | -------------------------------------------------------------------------------- /playground/docusaurus/blog/authors.yml: -------------------------------------------------------------------------------- 1 | endi: 2 | name: Endilie Yacop Sucipto 3 | title: Maintainer of Docusaurus 4 | url: https://github.com/endiliey 5 | image_url: https://github.com/endiliey.png 6 | 7 | yangshun: 8 | name: Yangshun Tay 9 | title: Front End Engineer @ Facebook 10 | url: https://github.com/yangshun 11 | image_url: https://github.com/yangshun.png 12 | 13 | slorber: 14 | name: Sébastien Lorber 15 | title: Docusaurus maintainer 16 | url: https://sebastienlorber.com 17 | image_url: https://github.com/slorber.png 18 | 19 | gaowei: 20 | name: Gao Wei 21 | title: Docusaurus maintainer 22 | url: https://github.com/gaowei 23 | image_url: https://github.com/wgao19.png 24 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Tutorial Intro 6 | 7 | Let's discover **Docusaurus in less than 5 minutes**. 8 | 9 | ## Getting Started 10 | 11 | Get started by **creating a new site**. 
12 | 13 | Or **try Docusaurus immediately** with **[docusaurus.new](https://docusaurus.new)**. 14 | 15 | ### What you'll need 16 | 17 | - [Node.js](https://nodejs.org/en/download/) version 16.14 or above: 18 | - When installing Node.js, you are recommended to check all checkboxes related to dependencies. 19 | 20 | ## Generate a new site 21 | 22 | Generate a new Docusaurus site using the **classic template**. 23 | 24 | The classic template will automatically be added to your project after you run the command: 25 | 26 | ```bash 27 | npm init docusaurus@latest my-website classic 28 | ``` 29 | 30 | You can type this command into Command Prompt, Powershell, Terminal, or any other integrated terminal of your code editor. 31 | 32 | The command also installs all necessary dependencies you need to run Docusaurus. 33 | 34 | ## Start your site 35 | 36 | Run the development server: 37 | 38 | ```bash 39 | cd my-website 40 | npm run start 41 | ``` 42 | 43 | The `cd` command changes the directory you're working with. In order to work with your newly created Docusaurus site, you'll need to navigate the terminal there. 44 | 45 | The `npm run start` command builds your website locally and serves it through a development server, ready for you to view at http://localhost:3000/. 46 | 47 | Open `docs/intro.md` (this page) and edit some lines: the site **reloads automatically** and displays your changes. 48 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Tutorial - Basics", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index", 6 | "description": "5 minutes to learn the most important Docusaurus concepts." 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/congratulations.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 6 3 | --- 4 | 5 | # Congratulations! 6 | 7 | You have just learned the **basics of Docusaurus** and made some changes to the **initial template**. 8 | 9 | Docusaurus has **much more to offer**! 10 | 11 | Have **5 more minutes**? Take a look at **[versioning](../tutorial-extras/manage-docs-versions.md)** and **[i18n](../tutorial-extras/translate-your-site.md)**. 12 | 13 | Anything **unclear** or **buggy** in this tutorial? [Please report it!](https://github.com/facebook/docusaurus/discussions/4610) 14 | 15 | ## What's next? 
16 | 17 | - Read the [official documentation](https://docusaurus.io/) 18 | - Modify your site configuration with [`docusaurus.config.js`](https://docusaurus.io/docs/api/docusaurus-config) 19 | - Add navbar and footer items with [`themeConfig`](https://docusaurus.io/docs/api/themes/configuration) 20 | - Add a custom [Design and Layout](https://docusaurus.io/docs/styling-layout) 21 | - Add a [search bar](https://docusaurus.io/docs/search) 22 | - Find inspirations in the [Docusaurus showcase](https://docusaurus.io/showcase) 23 | - Get involved in the [Docusaurus Community](https://docusaurus.io/community/support) 24 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/create-a-blog-post.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | --- 4 | 5 | # Create a Blog Post 6 | 7 | Docusaurus creates a **page for each blog post**, but also a **blog index page**, a **tag system**, an **RSS** feed... 8 | 9 | ## Create your first Post 10 | 11 | Create a file at `blog/2021-02-28-greetings.md`: 12 | 13 | ```md title="blog/2021-02-28-greetings.md" 14 | --- 15 | slug: greetings 16 | title: Greetings! 17 | authors: 18 | - name: Joel Marcey 19 | title: Co-creator of Docusaurus 1 20 | url: https://github.com/JoelMarcey 21 | image_url: https://github.com/JoelMarcey.png 22 | - name: Sébastien Lorber 23 | title: Docusaurus maintainer 24 | url: https://sebastienlorber.com 25 | image_url: https://github.com/slorber.png 26 | tags: [greetings] 27 | --- 28 | 29 | Congratulations, you have made your first post! 30 | 31 | Feel free to play around and edit this post as much you like. 32 | ``` 33 | 34 | A new blog post is now available at [http://localhost:3000/blog/greetings](http://localhost:3000/blog/greetings). 35 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/create-a-document.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | --- 4 | 5 | # Create a Document 6 | 7 | Documents are **groups of pages** connected through: 8 | 9 | - a **sidebar** 10 | - **previous/next navigation** 11 | - **versioning** 12 | 13 | ## Create your first Doc 14 | 15 | Create a Markdown file at `docs/hello.md`: 16 | 17 | ```md title="docs/hello.md" 18 | # Hello 19 | 20 | This is my **first Docusaurus document**! 21 | ``` 22 | 23 | A new document is now available at [http://localhost:3000/docs/hello](http://localhost:3000/docs/hello). 24 | 25 | ## Configure the Sidebar 26 | 27 | Docusaurus automatically **creates a sidebar** from the `docs` folder. 28 | 29 | Add metadata to customize the sidebar label and position: 30 | 31 | ```md title="docs/hello.md" {1-4} 32 | --- 33 | sidebar_label: 'Hi!' 34 | sidebar_position: 3 35 | --- 36 | 37 | # Hello 38 | 39 | This is my **first Docusaurus document**! 
40 | ``` 41 | 42 | It is also possible to create your sidebar explicitly in `sidebars.js`: 43 | 44 | ```js title="sidebars.js" 45 | module.exports = { 46 | tutorialSidebar: [ 47 | 'intro', 48 | // highlight-next-line 49 | 'hello', 50 | { 51 | type: 'category', 52 | label: 'Tutorial', 53 | items: ['tutorial-basics/create-a-document'], 54 | }, 55 | ], 56 | }; 57 | ``` 58 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/create-a-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Create a Page 6 | 7 | Add **Markdown or React** files to `src/pages` to create a **standalone page**: 8 | 9 | - `src/pages/index.js` → `localhost:3000/` 10 | - `src/pages/foo.md` → `localhost:3000/foo` 11 | - `src/pages/foo/bar.js` → `localhost:3000/foo/bar` 12 | 13 | ## Create your first React Page 14 | 15 | Create a file at `src/pages/my-react-page.js`: 16 | 17 | ```jsx title="src/pages/my-react-page.js" 18 | import React from 'react'; 19 | import Layout from '@theme/Layout'; 20 | 21 | export default function MyReactPage() { 22 | return ( 23 | 24 |

<Layout>
      <h1>My React page</h1>
25 |       <p>This is a React page</p>
26 |     </Layout>
27 | ); 28 | } 29 | ``` 30 | 31 | A new page is now available at [http://localhost:3000/my-react-page](http://localhost:3000/my-react-page). 32 | 33 | ## Create your first Markdown Page 34 | 35 | Create a file at `src/pages/my-markdown-page.md`: 36 | 37 | ```mdx title="src/pages/my-markdown-page.md" 38 | # My Markdown page 39 | 40 | This is a Markdown page 41 | ``` 42 | 43 | A new page is now available at [http://localhost:3000/my-markdown-page](http://localhost:3000/my-markdown-page). 44 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/deploy-your-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 5 3 | --- 4 | 5 | # Deploy your site 6 | 7 | Docusaurus is a **static-site-generator** (also called **[Jamstack](https://jamstack.org/)**). 8 | 9 | It builds your site as simple **static HTML, JavaScript and CSS files**. 10 | 11 | ## Build your site 12 | 13 | Build your site **for production**: 14 | 15 | ```bash 16 | npm run build 17 | ``` 18 | 19 | The static files are generated in the `build` folder. 20 | 21 | ## Deploy your site 22 | 23 | Test your production build locally: 24 | 25 | ```bash 26 | npm run serve 27 | ``` 28 | 29 | The `build` folder is now served at [http://localhost:3000/](http://localhost:3000/). 30 | 31 | You can now deploy the `build` folder **almost anywhere** easily, **for free** or very small cost (read the **[Deployment Guide](https://docusaurus.io/docs/deployment)**). 32 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-basics/markdown-features.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | --- 4 | 5 | # Markdown Features 6 | 7 | Docusaurus supports **[Markdown](https://daringfireball.net/projects/markdown/syntax)** and a few **additional features**. 8 | 9 | ## Front Matter 10 | 11 | Markdown documents have metadata at the top called [Front Matter](https://jekyllrb.com/docs/front-matter/): 12 | 13 | ```text title="my-doc.md" 14 | // highlight-start 15 | --- 16 | id: my-doc-id 17 | title: My document title 18 | description: My document description 19 | slug: /my-custom-url 20 | --- 21 | // highlight-end 22 | 23 | ## Markdown heading 24 | 25 | Markdown text with [links](./hello.md) 26 | ``` 27 | 28 | ## Links 29 | 30 | Regular Markdown links are supported, using url paths or relative file paths. 31 | 32 | ```md 33 | Let's see how to [Create a page](/create-a-page). 34 | ``` 35 | 36 | ```md 37 | Let's see how to [Create a page](./create-a-page.md). 38 | ``` 39 | 40 | **Result:** Let's see how to [Create a page](./create-a-page.md). 41 | 42 | ## Images 43 | 44 | Regular Markdown images are supported. 45 | 46 | You can use absolute paths to reference images in the static directory (`static/img/docusaurus.png`): 47 | 48 | ```md 49 | ![Docusaurus logo](/img/docusaurus.png) 50 | ``` 51 | 52 | ![Docusaurus logo](/img/docusaurus.png) 53 | 54 | You can reference images relative to the current file as well. This is particularly useful to colocate images close to the Markdown files using them: 55 | 56 | ```md 57 | ![Docusaurus logo](./img/docusaurus.png) 58 | ``` 59 | 60 | ## Code Blocks 61 | 62 | Markdown code blocks are supported with Syntax highlighting. 63 | 64 | ```jsx title="src/components/HelloDocusaurus.js" 65 | function HelloDocusaurus() { 66 | return ( 67 |

<h1>Hello, Docusaurus!</h1>

68 | ) 69 | } 70 | ``` 71 | 72 | ```jsx title="src/components/HelloDocusaurus.js" 73 | function HelloDocusaurus() { 74 | return

<h1>Hello, Docusaurus!</h1>

; 75 | } 76 | ``` 77 | 78 | ## Admonitions 79 | 80 | Docusaurus has a special syntax to create admonitions and callouts: 81 | 82 | :::tip My tip 83 | 84 | Use this awesome feature option 85 | 86 | ::: 87 | 88 | :::danger Take care 89 | 90 | This action is dangerous 91 | 92 | ::: 93 | 94 | :::tip My tip 95 | 96 | Use this awesome feature option 97 | 98 | ::: 99 | 100 | :::danger Take care 101 | 102 | This action is dangerous 103 | 104 | ::: 105 | 106 | ## MDX and React Components 107 | 108 | [MDX](https://mdxjs.com/) can make your documentation more **interactive** and allows using any **React components inside Markdown**: 109 | 110 | ```jsx 111 | export const Highlight = ({children, color}) => ( 112 | { 121 | alert(`You clicked the color ${color} with label ${children}`) 122 | }}> 123 | {children} 124 | 125 | ); 126 | 127 | This is Docusaurus green ! 128 | 129 | This is Facebook blue ! 130 | ``` 131 | 132 | export const Highlight = ({children, color}) => ( 133 | { 142 | alert(`You clicked the color ${color} with label ${children}`); 143 | }}> 144 | {children} 145 | 146 | ); 147 | 148 | This is Docusaurus green ! 149 | 150 | This is Facebook blue ! 151 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-extras/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Tutorial - Extras", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-extras/img/docsVersionDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/docs/tutorial-extras/img/docsVersionDropdown.png -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-extras/img/localeDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/docs/tutorial-extras/img/localeDropdown.png -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-extras/manage-docs-versions.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Manage Docs Versions 6 | 7 | Docusaurus can manage multiple versions of your docs. 8 | 9 | ## Create a docs version 10 | 11 | Release a version 1.0 of your project: 12 | 13 | ```bash 14 | npm run docusaurus docs:version 1.0 15 | ``` 16 | 17 | The `docs` folder is copied into `versioned_docs/version-1.0` and `versions.json` is created. 18 | 19 | Your docs now have 2 versions: 20 | 21 | - `1.0` at `http://localhost:3000/docs/` for the version 1.0 docs 22 | - `current` at `http://localhost:3000/docs/next/` for the **upcoming, unreleased docs** 23 | 24 | ## Add a Version Dropdown 25 | 26 | To navigate seamlessly across versions, add a version dropdown. 
27 | 28 | Modify the `docusaurus.config.js` file: 29 | 30 | ```js title="docusaurus.config.js" 31 | module.exports = { 32 | themeConfig: { 33 | navbar: { 34 | items: [ 35 | // highlight-start 36 | { 37 | type: 'docsVersionDropdown', 38 | }, 39 | // highlight-end 40 | ], 41 | }, 42 | }, 43 | }; 44 | ``` 45 | 46 | The docs version dropdown appears in your navbar: 47 | 48 | ![Docs Version Dropdown](./img/docsVersionDropdown.png) 49 | 50 | ## Update an existing version 51 | 52 | It is possible to edit versioned docs in their respective folder: 53 | 54 | - `versioned_docs/version-1.0/hello.md` updates `http://localhost:3000/docs/hello` 55 | - `docs/hello.md` updates `http://localhost:3000/docs/next/hello` 56 | -------------------------------------------------------------------------------- /playground/docusaurus/docs/tutorial-extras/translate-your-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | --- 4 | 5 | # Translate your site 6 | 7 | Let's translate `docs/intro.md` to French. 8 | 9 | ## Configure i18n 10 | 11 | Modify `docusaurus.config.js` to add support for the `fr` locale: 12 | 13 | ```js title="docusaurus.config.js" 14 | module.exports = { 15 | i18n: { 16 | defaultLocale: 'en', 17 | locales: ['en', 'fr'], 18 | }, 19 | }; 20 | ``` 21 | 22 | ## Translate a doc 23 | 24 | Copy the `docs/intro.md` file to the `i18n/fr` folder: 25 | 26 | ```bash 27 | mkdir -p i18n/fr/docusaurus-plugin-content-docs/current/ 28 | 29 | cp docs/intro.md i18n/fr/docusaurus-plugin-content-docs/current/intro.md 30 | ``` 31 | 32 | Translate `i18n/fr/docusaurus-plugin-content-docs/current/intro.md` in French. 33 | 34 | ## Start your localized site 35 | 36 | Start your site on the French locale: 37 | 38 | ```bash 39 | npm run start -- --locale fr 40 | ``` 41 | 42 | Your localized site is accessible at [http://localhost:3000/fr/](http://localhost:3000/fr/) and the `Getting Started` page is translated. 43 | 44 | :::caution 45 | 46 | In development, you can only use one locale at a same time. 47 | 48 | ::: 49 | 50 | ## Add a Locale Dropdown 51 | 52 | To navigate seamlessly across languages, add a locale dropdown. 
53 | 54 | Modify the `docusaurus.config.js` file: 55 | 56 | ```js title="docusaurus.config.js" 57 | module.exports = { 58 | themeConfig: { 59 | navbar: { 60 | items: [ 61 | // highlight-start 62 | { 63 | type: 'localeDropdown', 64 | }, 65 | // highlight-end 66 | ], 67 | }, 68 | }, 69 | }; 70 | ``` 71 | 72 | The locale dropdown now appears in your navbar: 73 | 74 | ![Locale Dropdown](./img/localeDropdown.png) 75 | 76 | ## Build your localized site 77 | 78 | Build your site for a specific locale: 79 | 80 | ```bash 81 | npm run build -- --locale fr 82 | ``` 83 | 84 | Or build your site to include all the locales at once: 85 | 86 | ```bash 87 | npm run build 88 | ``` 89 | -------------------------------------------------------------------------------- /playground/docusaurus/docusaurus.config.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | // Note: type annotations allow type checking and IDEs autocompletion 3 | 4 | const lightCodeTheme = require('prism-react-renderer/themes/github'); 5 | const darkCodeTheme = require('prism-react-renderer/themes/dracula'); 6 | 7 | /** @type {import('@docusaurus/types').Config} */ 8 | const config = { 9 | title: 'My Site', 10 | tagline: 'Dinosaurs are cool', 11 | favicon: 'img/favicon.ico', 12 | 13 | // Set the production url of your site here 14 | url: 'https://your-docusaurus-test-site.com', 15 | // Set the // pathname under which your site is served 16 | // For GitHub pages deployment, it is often '//' 17 | baseUrl: '/', 18 | 19 | // GitHub pages deployment config. 20 | // If you aren't using GitHub pages, you don't need these. 21 | organizationName: 'facebook', // Usually your GitHub org/user name. 22 | projectName: 'docusaurus', // Usually your repo name. 23 | 24 | onBrokenLinks: 'throw', 25 | onBrokenMarkdownLinks: 'warn', 26 | 27 | // Even if you don't use internalization, you can use this field to set useful 28 | // metadata like html lang. For example, if your site is Chinese, you may want 29 | // to replace "en" with "zh-Hans". 30 | i18n: { 31 | defaultLocale: 'en', 32 | locales: ['en'], 33 | }, 34 | 35 | presets: [ 36 | [ 37 | 'classic', 38 | /** @type {import('@docusaurus/preset-classic').Options} */ 39 | ({ 40 | docs: { 41 | sidebarPath: require.resolve('./sidebars.js'), 42 | // Please change this to your repo. 43 | // Remove this to remove the "edit this page" links. 44 | editUrl: 45 | 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', 46 | }, 47 | blog: { 48 | showReadingTime: true, 49 | // Please change this to your repo. 50 | // Remove this to remove the "edit this page" links. 
51 | editUrl: 52 | 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', 53 | }, 54 | theme: { 55 | customCss: require.resolve('./src/css/custom.css'), 56 | }, 57 | }), 58 | ], 59 | ], 60 | 61 | themeConfig: 62 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ 63 | ({ 64 | // Replace with your project's social card 65 | image: 'img/docusaurus-social-card.jpg', 66 | navbar: { 67 | title: 'My Site', 68 | logo: { 69 | alt: 'My Site Logo', 70 | src: 'img/logo.svg', 71 | }, 72 | items: [ 73 | { 74 | type: 'docSidebar', 75 | sidebarId: 'tutorialSidebar', 76 | position: 'left', 77 | label: 'Tutorial', 78 | }, 79 | {to: '/blog', label: 'Blog', position: 'left'}, 80 | { 81 | href: 'https://github.com/facebook/docusaurus', 82 | label: 'GitHub', 83 | position: 'right', 84 | }, 85 | ], 86 | }, 87 | footer: { 88 | style: 'dark', 89 | links: [ 90 | { 91 | title: 'Docs', 92 | items: [ 93 | { 94 | label: 'Tutorial', 95 | to: '/docs/intro', 96 | }, 97 | ], 98 | }, 99 | { 100 | title: 'Community', 101 | items: [ 102 | { 103 | label: 'Stack Overflow', 104 | href: 'https://stackoverflow.com/questions/tagged/docusaurus', 105 | }, 106 | { 107 | label: 'Discord', 108 | href: 'https://discordapp.com/invite/docusaurus', 109 | }, 110 | { 111 | label: 'Twitter', 112 | href: 'https://twitter.com/docusaurus', 113 | }, 114 | ], 115 | }, 116 | { 117 | title: 'More', 118 | items: [ 119 | { 120 | label: 'Blog', 121 | to: '/blog', 122 | }, 123 | { 124 | label: 'GitHub', 125 | href: 'https://github.com/facebook/docusaurus', 126 | }, 127 | ], 128 | }, 129 | ], 130 | copyright: `Copyright © ${new Date().getFullYear()} My Project, Inc. Built with Docusaurus.`, 131 | }, 132 | prism: { 133 | theme: lightCodeTheme, 134 | darkTheme: darkCodeTheme, 135 | }, 136 | }), 137 | }; 138 | 139 | module.exports = config; 140 | -------------------------------------------------------------------------------- /playground/docusaurus/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "docusaurus-classic-typescript", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "serve": "docusaurus serve", 13 | "write-translations": "docusaurus write-translations", 14 | "write-heading-ids": "docusaurus write-heading-ids", 15 | "typecheck": "tsc", 16 | "dev": "docusaurus start" 17 | }, 18 | "dependencies": { 19 | "@docusaurus/core": "^3.7.0", 20 | "@docusaurus/preset-classic": "^3.7.0", 21 | "@mdx-js/react": "^3.0.0", 22 | "@meilisearch/instant-meilisearch": "^0.13.3", 23 | "clsx": "^1.2.1", 24 | "meilisearch-docsearch": "^0.4.7", 25 | "prism-react-renderer": "^1.3.5", 26 | "react": "^18.3.1", 27 | "react-dom": "^18.3.1", 28 | "react-instantsearch-dom": "^6.40.1" 29 | }, 30 | "devDependencies": { 31 | "@docusaurus/module-type-aliases": "^3.7.0", 32 | "@tsconfig/docusaurus": "^1.0.5", 33 | "typescript": "^4.7.4" 34 | }, 35 | "browserslist": { 36 | "production": [ 37 | ">0.5%", 38 | "not dead", 39 | "not op_mini all" 40 | ], 41 | "development": [ 42 | "last 1 chrome version", 43 | "last 1 firefox version", 44 | "last 1 safari version" 45 | ] 46 | }, 47 | "engines": { 48 | "node": ">=16.14" 49 | }, 50 | "description": "Docusaurus example project (classic-typescript template)" 51 | } 52 | 
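Aside: the two search clients declared in this package.json — `meilisearch-docsearch` and `@meilisearch/instant-meilisearch` — are what the playground pages use to query the index that Scrapix builds. As a hedged orientation sketch (not a file of the playground itself), the docsearch widget is wired roughly as in the swizzled `src/theme/SearchBar/index.tsx` further down in this dump; the host, API key and index UID below are the playground's local-dev defaults and should be treated as assumptions anywhere else:

```ts
import docsearch from 'meilisearch-docsearch'
import 'meilisearch-docsearch/css'

// Minimal sketch: mount the docsearch widget into an existing #docsearch element.
// host, apiKey and indexUid are the playground's local-dev defaults
// (see scrapix-config.json and src/theme/SearchBar/index.tsx), not production values.
const destroy = docsearch({
  host: 'http://localhost:7700',
  apiKey: 'masterKey',
  indexUid: 'docusaurus-docsearch',
  container: '#docsearch',
})

// docsearch() returns a teardown function; call it when the UI unmounts.
// destroy()
```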
-------------------------------------------------------------------------------- /playground/docusaurus/sandbox.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "infiniteLoopProtection": true, 3 | "hardReloadOnChange": true, 4 | "view": "browser", 5 | "template": "docusaurus", 6 | "node": "16", 7 | "container": { 8 | "node": "16" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /playground/docusaurus/scrapix-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "start_urls": [ 3 | "http://localhost:3000/" 4 | ], 5 | "meilisearch_url": "http://localhost:7700", 6 | "meilisearch_api_key": "masterKey", 7 | "meilisearch_index_uid": "scrapix_playground", 8 | "strategy": "docssearch", 9 | "headless": false 10 | } 11 | -------------------------------------------------------------------------------- /playground/docusaurus/sidebars.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Creating a sidebar enables you to: 3 | - create an ordered group of docs 4 | - render a sidebar for each doc of that group 5 | - provide next/previous navigation 6 | 7 | The sidebars can be generated from the filesystem, or explicitly defined here. 8 | 9 | Create as many sidebars as you want. 10 | */ 11 | 12 | // @ts-check 13 | 14 | /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ 15 | const sidebars = { 16 | // By default, Docusaurus generates a sidebar from the docs folder structure 17 | tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], 18 | 19 | // But you can create a sidebar manually 20 | /* 21 | tutorialSidebar: [ 22 | 'intro', 23 | 'hello', 24 | { 25 | type: 'category', 26 | label: 'Tutorial', 27 | items: ['tutorial-basics/create-a-document'], 28 | }, 29 | ], 30 | */ 31 | }; 32 | 33 | module.exports = sidebars; 34 | -------------------------------------------------------------------------------- /playground/docusaurus/src/components/HomepageFeatures/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import clsx from 'clsx'; 3 | import styles from './styles.module.css'; 4 | 5 | type FeatureItem = { 6 | title: string; 7 | Svg: React.ComponentType>; 8 | description: JSX.Element; 9 | }; 10 | 11 | const FeatureList: FeatureItem[] = [ 12 | { 13 | title: 'Easy to Use', 14 | Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, 15 | description: ( 16 | <> 17 | Docusaurus was designed from the ground up to be easily installed and 18 | used to get your website up and running quickly. 19 | 20 | ), 21 | }, 22 | { 23 | title: 'Focus on What Matters', 24 | Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, 25 | description: ( 26 | <> 27 | Docusaurus lets you focus on your docs, and we'll do the chores. Go 28 | ahead and move your docs into the docs directory. 29 | 30 | ), 31 | }, 32 | { 33 | title: 'Powered by React', 34 | Svg: require('@site/static/img/undraw_docusaurus_react.svg').default, 35 | description: ( 36 | <> 37 | Extend or customize your website layout by reusing React. Docusaurus can 38 | be extended while reusing the same header and footer. 39 | 40 | ), 41 | }, 42 | ]; 43 | 44 | function Feature({title, Svg, description}: FeatureItem) { 45 | return ( 46 |
47 |
48 | 49 |
50 |
51 |

<h3>{title}</h3>

52 |

<p>{description}</p>

53 |
54 |
55 | ); 56 | } 57 | 58 | export default function HomepageFeatures(): JSX.Element { 59 | return ( 60 |
61 |
62 |
63 | {FeatureList.map((props, idx) => ( 64 | 65 | ))} 66 |
67 |
68 |
69 | ); 70 | } 71 | -------------------------------------------------------------------------------- /playground/docusaurus/src/components/HomepageFeatures/styles.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | padding: 2rem 0; 5 | width: 100%; 6 | } 7 | 8 | .featureSvg { 9 | height: 200px; 10 | width: 200px; 11 | } 12 | -------------------------------------------------------------------------------- /playground/docusaurus/src/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Any CSS included here will be global. The classic template 3 | * bundles Infima by default. Infima is a CSS framework designed to 4 | * work well for content-centric websites. 5 | */ 6 | 7 | /* You can override the default Infima variables here. */ 8 | :root { 9 | --ifm-color-primary: #2e8555; 10 | --ifm-color-primary-dark: #29784c; 11 | --ifm-color-primary-darker: #277148; 12 | --ifm-color-primary-darkest: #205d3b; 13 | --ifm-color-primary-light: #33925d; 14 | --ifm-color-primary-lighter: #359962; 15 | --ifm-color-primary-lightest: #3cad6e; 16 | --ifm-code-font-size: 95%; 17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); 18 | } 19 | 20 | /* For readability concerns, you should choose a lighter palette in dark mode. */ 21 | [data-theme='dark'] { 22 | --ifm-color-primary: #25c2a0; 23 | --ifm-color-primary-dark: #21af90; 24 | --ifm-color-primary-darker: #1fa588; 25 | --ifm-color-primary-darkest: #1a8870; 26 | --ifm-color-primary-light: #29d5b0; 27 | --ifm-color-primary-lighter: #32d8b4; 28 | --ifm-color-primary-lightest: #4fddbf; 29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); 30 | } 31 | -------------------------------------------------------------------------------- /playground/docusaurus/src/pages/DefaultSearchBar.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import 'instantsearch.css/themes/algolia-min.css' 3 | import { InstantSearch, SearchBox, Hits, Highlight, Configure } from 'react-instantsearch-dom' 4 | 5 | import { instantMeiliSearch } from '@meilisearch/instant-meilisearch'; 6 | // import '@meilisearch/instant-meilisearch/template.css' 7 | 8 | const searchClient = instantMeiliSearch("http://localhost:7700", "masterKey", { primaryKey: "uid" }); 9 | 10 | const Hit = ({ hit }) => { 11 | return (
12 |
13 | 14 |
15 |
16 | 17 |
18 |
19 | 20 |
21 |
22 | 23 |
24 |
) 25 | } 26 | 27 | const CustomPage: React.FC = () => { 28 | 29 | 30 | React.useEffect(() => { 31 | const searchInput = document.querySelector(".ais-SearchBox input") as HTMLInputElement 32 | if (searchInput) { 33 | searchInput.focus() 34 | } 35 | }, []); 36 | 37 | return ( 38 |
39 |

Docusaurus with default strategy

40 | 41 | 46 | 47 | 48 | 49 |
50 | ); 51 | }; 52 | 53 | export default CustomPage; 54 | -------------------------------------------------------------------------------- /playground/docusaurus/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /** 2 | * CSS files with the .module.css suffix will be treated as CSS modules 3 | * and scoped locally. 4 | */ 5 | 6 | .heroBanner { 7 | padding: 4rem 0; 8 | text-align: center; 9 | position: relative; 10 | overflow: hidden; 11 | } 12 | 13 | @media screen and (max-width: 996px) { 14 | .heroBanner { 15 | padding: 2rem; 16 | } 17 | } 18 | 19 | .buttons { 20 | display: flex; 21 | align-items: center; 22 | justify-content: center; 23 | } 24 | -------------------------------------------------------------------------------- /playground/docusaurus/src/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import clsx from 'clsx'; 3 | import Link from '@docusaurus/Link'; 4 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 5 | import Layout from '@theme/Layout'; 6 | import HomepageFeatures from '@site/src/components/HomepageFeatures'; 7 | 8 | import styles from './index.module.css'; 9 | 10 | function HomepageHeader() { 11 | const { siteConfig } = useDocusaurusContext(); 12 | return ( 13 |
14 |
15 |

<h1 className="hero__title">{siteConfig.title}</h1>

16 |

<p className="hero__subtitle">{siteConfig.tagline}</p>

17 |
18 | 21 | Docusaurus Tutorial - 5min ⏱️ 22 | 23 |
24 |
25 |
26 | ); 27 | } 28 | 29 | export default function Home(): JSX.Element { 30 | const { siteConfig } = useDocusaurusContext(); 31 | return ( 32 | 35 | 36 |
37 | 38 |
39 |
40 | ); 41 | } 42 | -------------------------------------------------------------------------------- /playground/docusaurus/src/pages/markdown-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Markdown page example 3 | --- 4 | 5 | # Markdown page example 6 | 7 | You don't need React to write simple standalone pages. 8 | -------------------------------------------------------------------------------- /playground/docusaurus/src/theme/SearchBar/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | 3 | import 'meilisearch-docsearch/css' 4 | 5 | const SearchPage = () => { 6 | const docsearchRef = React.useRef(null); 7 | 8 | React.useEffect(() => { 9 | const docsearch = require('meilisearch-docsearch').default 10 | const destroy = docsearch({ 11 | host: 'http://localhost:7700', 12 | apiKey: 13 | 'masterKey', 14 | indexUid: 'docusaurus-docsearch', 15 | container: '#docsearch', 16 | debug: true 17 | }) 18 | 19 | return () => destroy() 20 | }, []) 21 | 22 | React.useEffect(() => { 23 | docsearchRef.current.firstChild.click(); 24 | const elem = document.querySelector(".docsearch-modal-search-input") as HTMLInputElement 25 | if (elem) { 26 | elem.focus(); 27 | elem.value = "g" 28 | elem.dispatchEvent(new Event('input', { bubbles: true })); // Trigger input event 29 | } 30 | return () => { 31 | if (docsearchRef.current) { 32 | docsearchRef.current.unsubscribe(); 33 | } 34 | }; 35 | }, []); 36 | 37 | return ( 38 |
39 |
40 |
41 | ) 42 | } 43 | 44 | export default SearchPage 45 | -------------------------------------------------------------------------------- /playground/docusaurus/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/static/.nojekyll -------------------------------------------------------------------------------- /playground/docusaurus/static/img/docusaurus-social-card.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/static/img/docusaurus-social-card.jpg -------------------------------------------------------------------------------- /playground/docusaurus/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/static/img/docusaurus.png -------------------------------------------------------------------------------- /playground/docusaurus/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meilisearch/scrapix/baf51c7cf005ef6869294a4d44b5f79790dcf0b0/playground/docusaurus/static/img/favicon.ico -------------------------------------------------------------------------------- /playground/docusaurus/static/img/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/docusaurus/static/img/undraw_docusaurus_mountain.svg: -------------------------------------------------------------------------------- 1 | 2 | Easy to Use 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /playground/docusaurus/static/img/undraw_docusaurus_react.svg: -------------------------------------------------------------------------------- 1 | 2 | Powered by React 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 
| 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /playground/docusaurus/static/img/undraw_docusaurus_tree.svg: -------------------------------------------------------------------------------- 1 | 2 | Focus on What Matters 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /playground/docusaurus/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | // This file is not used in compilation. It is here just for a nice editor experience. 3 | "extends": "@tsconfig/docusaurus/tsconfig.json", 4 | "compilerOptions": { 5 | "baseUrl": ".", 6 | "jsx": "react" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/bin/index.ts: -------------------------------------------------------------------------------- 1 | import * as dotenv from 'dotenv' 2 | dotenv.config() 3 | 4 | import fs from 'fs' 5 | import yargs from 'yargs' 6 | import { hideBin } from 'yargs/helpers' 7 | import { Sender } from '../sender' 8 | import { Crawler } from '../crawler' 9 | import { Config } from '../types' 10 | 11 | function getConfig({ 12 | configPath, 13 | config, 14 | }: { 15 | configPath?: string 16 | config?: string 17 | }): Config { 18 | if (configPath) { 19 | return JSON.parse( 20 | fs.readFileSync(configPath, { encoding: 'utf-8' }) 21 | ) as Config 22 | } else if (config) { 23 | return JSON.parse(config) as Config 24 | } 25 | 26 | throw new Error('Please provide either --config or --configPath') 27 | } 28 | 29 | // eslint-disable-next-line @typescript-eslint/no-floating-promises 30 | ;(async () => { 31 | // Parse command line arguments and get a configuration file path 32 | const argv = await yargs(hideBin(process.argv)) 33 | .option('config', { 34 | alias: 'c', 35 | describe: 'configuration', 36 | type: 'string', 37 | }) 38 | .option('configPath', { 39 | alias: 'p', 40 | describe: 'Path to configuration file', 41 | type: 'string', 42 | }) 43 | .option('browserPath', { 44 | alias: 'b', 45 | describe: 'Path to browser binary', 46 | type: 'string', 47 | }) 48 | .check((argv) => { 49 | if (argv.config && argv.configPath) { 50 | throw new Error( 51 | 'You can only use either --config or --configPath, not both.' 52 | ) 53 | } else if (!argv.config && !argv.configPath) { 54 | throw new Error('You must provide one of --config or --configPath.') 55 | } 56 | return true 57 | }).argv 58 | 59 | const config = getConfig(argv) 60 | const launchOptions = argv.browserPath 61 | ? 
{ executablePath: argv.browserPath } 62 | : {} 63 | 64 | const sender = new Sender(config) 65 | await sender.init() 66 | 67 | const crawler = new Crawler(sender, config, launchOptions) 68 | 69 | await crawler.run() 70 | await sender.finish() 71 | })() 72 | -------------------------------------------------------------------------------- /src/crawler.ts: -------------------------------------------------------------------------------- 1 | import { 2 | createPuppeteerRouter, 3 | PuppeteerCrawler, 4 | Router, 5 | PuppeteerCrawlingContext, 6 | PuppeteerCrawlerOptions, 7 | RequestQueue, 8 | PuppeteerHook, 9 | } from 'crawlee' 10 | 11 | import { minimatch } from 'minimatch' 12 | import DefaultScraper from './scrapers/default' 13 | import DocsearchScraper from './scrapers/docssearch' 14 | import SchemaScraper from './scrapers/schema' 15 | import { Sender } from './sender' 16 | import { Config, Scraper } from './types' 17 | import { Webhook } from './webhook.js' 18 | import { 19 | BrowserLaunchArgumentOptions, 20 | LaunchOptions, 21 | PuppeteerNode, 22 | } from 'puppeteer-core' 23 | 24 | type DefaultHandler = Parameters< 25 | Parameters['addDefaultHandler']>[0] 26 | >[0] 27 | 28 | // Crawler class 29 | // This class is responsible for crawling the urls and extract content to send to Meilisearch 30 | // It uses the createPuppeteerRouter method to create a router that will be used by the PuppeteerCrawler. 31 | // The constructor take a Sender object as a parameter 32 | export class Crawler { 33 | sender: Sender 34 | config: Config 35 | urls: string[] 36 | scraper: Scraper 37 | nb_page_crawled = 0 38 | nb_page_indexed = 0 39 | launchOptions: Record = {} 40 | launcher?: PuppeteerNode 41 | 42 | constructor( 43 | sender: Sender, 44 | config: Config, 45 | launchOptions: Record = {}, 46 | launcher?: PuppeteerNode 47 | ) { 48 | this.sender = sender 49 | this.config = config 50 | this.urls = config.start_urls 51 | this.launchOptions = launchOptions 52 | this.launcher = launcher 53 | 54 | this.scraper = 55 | this.config.strategy == 'docssearch' 56 | ? new DocsearchScraper(this.sender, this.config) 57 | : this.config.strategy == 'schema' 58 | ? new SchemaScraper(this.sender, this.config) 59 | : new DefaultScraper(this.sender, this.config) 60 | } 61 | 62 | async run() { 63 | const requestQueue = await RequestQueue.open(JSON.stringify(this.urls)) 64 | // Enqueue the initial requests 65 | await requestQueue.addRequests(this.urls.map((url) => ({ url }))) 66 | 67 | //Create the router 68 | const router = createPuppeteerRouter() 69 | 70 | // type DefaultHandler = Parameters[0]; 71 | router.addDefaultHandler(this.defaultHandler.bind(this)) 72 | 73 | const preNavigationHooks: PuppeteerHook[] = this.config 74 | .additional_request_headers 75 | ? [ 76 | async (crawlingContext) => { 77 | await crawlingContext.addInterceptRequestHandler( 78 | async (request) => { 79 | return await request.continue({ 80 | headers: { 81 | ...request.headers(), 82 | ...this.config.additional_request_headers, 83 | }, 84 | }) 85 | } 86 | ) 87 | }, 88 | ] 89 | : [] 90 | 91 | const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = { 92 | requestQueue, 93 | requestHandler: router, 94 | preNavigationHooks: preNavigationHooks, 95 | launchContext: { 96 | launchOptions: { 97 | headless: this.config.headless ?? 
'new', 98 | args: ['--no-sandbox', '--disable-setuid-sandbox'], 99 | ignoreDefaultArgs: ['--disable-extensions'], 100 | ...this.launchOptions, 101 | }, 102 | } as LaunchOptions & BrowserLaunchArgumentOptions, 103 | } 104 | 105 | if (puppeteerCrawlerOptions.launchContext && this.launcher) { 106 | puppeteerCrawlerOptions.launchContext.launcher = this.launcher 107 | } 108 | // create the crawler 109 | const crawler = new PuppeteerCrawler(puppeteerCrawlerOptions) 110 | 111 | let interval = 5000 112 | if (process.env.WEBHOOK_INTERVAL) { 113 | interval = parseInt(process.env.WEBHOOK_INTERVAL) 114 | } 115 | 116 | const intervalId = setInterval(async () => { 117 | await Webhook.get(this.config).active(this.config, { 118 | nb_page_crawled: this.nb_page_crawled, 119 | nb_page_indexed: this.nb_page_indexed, 120 | nb_documents_sent: this.sender.nb_documents_sent, 121 | }) 122 | }, interval) 123 | 124 | try { 125 | await crawler.run() 126 | 127 | await Webhook.get(this.config).active(this.config, { 128 | nb_page_crawled: this.nb_page_crawled, 129 | nb_page_indexed: this.nb_page_indexed, 130 | nb_documents_sent: this.sender.nb_documents_sent, 131 | }) 132 | } catch (err) { 133 | await Webhook.get(this.config).failed(this.config, err as Error) 134 | } finally { 135 | clearInterval(intervalId) 136 | } 137 | await requestQueue.drop() 138 | } 139 | 140 | // Should we use `log` 141 | async defaultHandler({ request, enqueueLinks, page }: DefaultHandler) { 142 | this.nb_page_crawled++ 143 | const title = await page.title() 144 | console.log(`${title}`, { url: request.loadedUrl }) 145 | const crawled_globs = this.__generate_globs(this.urls) 146 | const excluded_crawled_globs = this.__generate_globs( 147 | this.config.urls_to_exclude || [] 148 | ) 149 | const indexed_globs = this.__generate_globs( 150 | this.config.urls_to_index || this.urls 151 | ) 152 | const excluded_indexed_globs = this.__generate_globs( 153 | this.config.urls_to_not_index || [] 154 | ) 155 | 156 | if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) { 157 | //check if the url is in the list of urls to scrap 158 | if ( 159 | this.__match_globs(request.loadedUrl, indexed_globs) && 160 | !this.__match_globs(request.loadedUrl, excluded_indexed_globs) 161 | ) { 162 | this.nb_page_indexed++ 163 | await this.scraper.get(request.loadedUrl, page) 164 | } 165 | } 166 | 167 | await enqueueLinks({ 168 | globs: crawled_globs, 169 | exclude: excluded_crawled_globs, 170 | transformRequestFunction: (req) => { 171 | // exclude all links that are files not parsable by puppeteer 172 | if (this.__is_file_url(req.url)) { 173 | return false 174 | } 175 | // remove all query params to avoid duplicates 176 | const urlObject = new URL(req.url) 177 | urlObject.search = '' 178 | // Remove all anchors to avoid duplicates 179 | urlObject.hash = '' 180 | req.url = urlObject.toString() 181 | 182 | return req 183 | }, 184 | }) 185 | } 186 | 187 | __generate_globs(urls: string[]) { 188 | return urls.map((url) => { 189 | if (url.endsWith('/')) { 190 | return url + '**' 191 | } 192 | return url + '/**' 193 | }) 194 | } 195 | 196 | __match_globs(url: string, globs: string[]) { 197 | return globs.some((glob) => minimatch(url, glob)) 198 | } 199 | 200 | __is_file_url(url: string) { 201 | const fileExtensions = [ 202 | '.zip', 203 | '.pdf', 204 | '.doc', 205 | '.docx', 206 | '.xls', 207 | '.xlsx', 208 | '.ppt', 209 | '.pptx', 210 | '.rar', 211 | '.tar', 212 | '.gz', 213 | '.tgz', 214 | '.7z', 215 | '.bz2', 216 | '.jpg', 217 | '.jpeg', 218 | '.png', 219 | '.gif', 
220 | '.svg', 221 | '.css', 222 | '.js', 223 | '.xml', 224 | '.txt', 225 | '.csv', 226 | '.rtf', 227 | '.mp3', 228 | '.wav', 229 | '.mp4', 230 | '.avi', 231 | '.mkv', 232 | '.mov', 233 | '.flv', 234 | '.wmv', 235 | '.m4v', 236 | '.ogg', 237 | '.mpg', 238 | '.mpeg', 239 | '.swf', 240 | ] 241 | return fileExtensions.some((extension) => url.endsWith(extension)) 242 | } 243 | 244 | __is_paginated_url(url: string) { 245 | const urlObject = new URL(url) 246 | const pathname = urlObject.pathname 247 | return /\/\d+\//.test(pathname) 248 | } 249 | } 250 | -------------------------------------------------------------------------------- /src/crawler_process.ts: -------------------------------------------------------------------------------- 1 | import { Sender } from './sender' 2 | import { Crawler } from './crawler' 3 | import { Config } from './types' 4 | 5 | async function startCrawling(config: Config) { 6 | const sender = new Sender(config) 7 | await sender.init() 8 | 9 | const crawler = new Crawler(sender, config) 10 | 11 | await crawler.run() 12 | await sender.finish() 13 | } 14 | 15 | // Listen for messages from the parent thread 16 | process.on('message', async (message: Config) => { 17 | await startCrawling(message) 18 | if (process.send) { 19 | process.send('Crawling finished') 20 | } 21 | }) 22 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export { Crawler } from './crawler' 2 | export { Sender } from './sender' 3 | export { TaskQueue } from './taskQueue' 4 | -------------------------------------------------------------------------------- /src/meilisearch_client.ts: -------------------------------------------------------------------------------- 1 | import { Config, MeiliSearch } from 'meilisearch' 2 | import { PACKAGE_VERSION } from './package_version' 3 | 4 | export function initMeilisearchClient({ 5 | host, 6 | apiKey, 7 | clientAgents = [], 8 | }: Config) { 9 | return new MeiliSearch({ 10 | host, 11 | apiKey, 12 | clientAgents: [ 13 | `Meilisearch Crawler (v${PACKAGE_VERSION})`, 14 | ...clientAgents, 15 | ], 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /src/package_version.ts: -------------------------------------------------------------------------------- 1 | export const PACKAGE_VERSION = '0.1.7' 2 | -------------------------------------------------------------------------------- /src/scrapers/default.ts: -------------------------------------------------------------------------------- 1 | import prettier from 'prettier' 2 | import { v4 as uuidv4 } from 'uuid' 3 | import { Sender } from '../sender' 4 | import { Config, Meta, DefaultDocument } from '../types' 5 | import { Page } from 'puppeteer' 6 | 7 | export default class DefaultScraper { 8 | sender: Sender 9 | settings: Config['meilisearch_settings'] 10 | 11 | constructor(sender: Sender, config: Config) { 12 | console.info('DefaultScraper::constructor') 13 | this.sender = sender 14 | this.settings = config.meilisearch_settings || { 15 | searchableAttributes: [ 16 | 'h1', 17 | 'h2', 18 | 'h3', 19 | 'h4', 20 | 'h5', 21 | 'h6', 22 | 'p', 23 | 'title', 24 | 'meta.description', 25 | ], 26 | filterableAttributes: ['urls_tags'], 27 | distinctAttribute: 'url', 28 | } 29 | void this.sender.updateSettings(this.settings) 30 | } 31 | 32 | async get(url: string, page: Page) { 33 | const title = await page.title() 34 | //get the meta of the page 35 | const 
meta = await this._extract_metadata_from_page(page) 36 | 37 | //for each page create dataset of consecutive h1, h2, h3, p. at each header after a paragraph, create a new dataset 38 | let data: DefaultDocument = {} as DefaultDocument 39 | let elems = await page.$$( 40 | 'main h1, main h2, main h3, main h4, main h5, main h6, main p, main td, main li, main span' 41 | ) 42 | if (elems.length === 0) { 43 | elems = await page.$$('h1, h2, h3, h4, h5, h6, p, td, li, span') 44 | } 45 | let page_block = 0 46 | for (let i = 0; i < elems.length; i++) { 47 | const elem = elems[i] 48 | const tag = await elem.evaluate((el) => el.tagName) 49 | let text = (await elem.evaluate((el) => el.textContent)) || '' 50 | text = this._clean_text(text) 51 | data.uid = uuidv4() 52 | data.url = url 53 | data.title = title 54 | data.meta = meta 55 | data.image_url = this._get_image_url_from_meta(meta) 56 | data.page_block = page_block 57 | const urls_tags = new URL(url).pathname.split('/') 58 | data.urls_tags = urls_tags.slice(1, urls_tags.length - 1) 59 | 60 | const id = await elem.evaluate((el) => el.id) 61 | if (tag === 'H1') { 62 | if (data['h1']) { 63 | await this._add_data(data) 64 | page_block++ 65 | data = {} as DefaultDocument 66 | } 67 | data['h1'] = text 68 | data.anchor = '#' + id 69 | } else if (tag === 'H2') { 70 | if (data['h2']) { 71 | await this._add_data(data) 72 | page_block++ 73 | data = { h1: data['h1'] } as DefaultDocument 74 | } 75 | data.anchor = '#' + id 76 | data['h2'] = text 77 | } else if (tag === 'H3') { 78 | if (data['h3']) { 79 | await this._add_data(data) 80 | page_block++ 81 | data = { h1: data['h1'], h2: data['h2'] } as DefaultDocument 82 | } 83 | data.anchor = '#' + id 84 | data['h3'] = text 85 | } else if (tag === 'H4') { 86 | if (data['h4']) { 87 | await this._add_data(data) 88 | page_block++ 89 | data = { 90 | h1: data['h1'], 91 | h2: data['h2'], 92 | h3: data['h3'], 93 | } as DefaultDocument 94 | } 95 | data.anchor = '#' + id 96 | data['h4'] = text 97 | } else if (tag === 'H5') { 98 | if (data['h5']) { 99 | await this._add_data(data) 100 | page_block++ 101 | data = { 102 | h1: data['h1'], 103 | h2: data['h2'], 104 | h3: data['h3'], 105 | h4: data['h4'], 106 | } as DefaultDocument 107 | } 108 | data.anchor = '#' + id 109 | data['h5'] = text 110 | } else if (tag === 'H6') { 111 | if (data['h6']) { 112 | await this._add_data(data) 113 | page_block++ 114 | data = { 115 | h1: data['h1'], 116 | h2: data['h2'], 117 | h3: data['h3'], 118 | h4: data['h4'], 119 | h5: data['h5'], 120 | } as DefaultDocument 121 | } 122 | data.anchor = '#' + id 123 | data['h6'] = text 124 | } else if ( 125 | tag === 'P' || 126 | tag === 'TD' || 127 | tag === 'LI' || 128 | tag === 'SPAN' 129 | ) { 130 | if (!data['p']) { 131 | data['p'] = [] 132 | } 133 | // TODO: should we leave `null` values in the `p` array? 
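// Only non-empty text that is not already present in the block's 'p' array is kept, so repeated inline elements do not produce duplicate paragraph entries.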
134 | if (text && Array.isArray(data['p']) && !data['p'].includes(text)) { 135 | data['p'].push(text) 136 | } 137 | } 138 | if (i === elems.length - 1) { 139 | await this._add_data(data) 140 | } 141 | } 142 | } 143 | 144 | async _add_data(data: DefaultDocument) { 145 | if (Array.isArray(data['p'])) { 146 | data['p'] = data['p'].join('\n') 147 | } 148 | await this.sender.add(data) 149 | } 150 | 151 | // Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and 152 | // remove '# ' from the beginning of the text 153 | _clean_text(text: string) { 154 | text = text.replace(/[\r\n]+/gm, ' ') 155 | ///remove multiple spaces 156 | text = text.replace(/\s+/g, ' ') 157 | ///remove '# ' 158 | text = text.replace('# ', '') 159 | /// Trim leading and trailing spaces 160 | text = text.replace(/^\s+|\s+$/g, '') 161 | return text 162 | } 163 | 164 | // Extract the meta of a page 165 | async _extract_metadata_from_page(page: Page) { 166 | return await page.evaluate(() => { 167 | const metas = document.getElementsByTagName('meta') 168 | const meta: Meta = {} as Meta 169 | for (let i = 0; i < metas.length; i++) { 170 | const name = metas[i].getAttribute('name') 171 | const content = metas[i].getAttribute('content') 172 | if (name && content) { 173 | meta[name] = content 174 | } 175 | } 176 | return meta 177 | }) 178 | } 179 | 180 | // Extract the image url from the meta of a page 181 | _get_image_url_from_meta(meta: Meta) { 182 | if (meta['og:image']) { 183 | return meta['og:image'] 184 | } else if (meta['twitter:image']) { 185 | return meta['twitter:image'] 186 | } else if (meta['image']) { 187 | return meta['image'] 188 | } 189 | return 190 | } 191 | 192 | // A function that retro-engineer the hljs generated html to extract the code 193 | async _extract_code_from_page(page: Page) { 194 | const code = await page.evaluate(() => { 195 | let code = '' 196 | const pre = document.getElementsByTagName('pre') 197 | for (let i = 0; i < pre.length; i++) { 198 | const code_elem = pre[i].getElementsByTagName('code') 199 | if (code_elem.length > 0) { 200 | code += code_elem[0].innerText 201 | } 202 | } 203 | return code 204 | }) 205 | return this._format_code(code) 206 | } 207 | // A function that use prettier to format the code that has been extracted in a html page. 
208 | // Format only if the language is supported by prettier 209 | _format_code(code: string) { 210 | let formatted_code = '' 211 | try { 212 | formatted_code = prettier.format(code, { 213 | parser: 'babel', 214 | }) 215 | } catch (e) { 216 | console.log('Error while formatting code', e) 217 | return code 218 | } 219 | return formatted_code 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/scrapers/docssearch.ts: -------------------------------------------------------------------------------- 1 | import { v4 as uuidv4 } from 'uuid' 2 | import { Sender } from '../sender' 3 | import { Config } from '../types' 4 | import { Page } from 'puppeteer' 5 | import { 6 | DocsSearchDocument, 7 | HTag, 8 | HierarchyLevel, 9 | RadioHierarchyLevel, 10 | } from '../types' 11 | 12 | const RADIO_HIERARCHY_LEVELS: Record = { 13 | H1: 'hierarchy_radio_lvl1', 14 | H2: 'hierarchy_radio_lvl2', 15 | H3: 'hierarchy_radio_lvl3', 16 | H4: 'hierarchy_radio_lvl4', 17 | H5: 'hierarchy_radio_lvl5', 18 | } 19 | 20 | const HIERARCHY_LEVELS: Record = { 21 | H1: 'hierarchy_lvl1', 22 | H2: 'hierarchy_lvl2', 23 | H3: 'hierarchy_lvl3', 24 | H4: 'hierarchy_lvl4', 25 | H5: 'hierarchy_lvl5', 26 | } 27 | 28 | const TAG_LEVELS: Record = { 29 | H1: 100, 30 | H2: 90, 31 | H3: 80, 32 | H4: 70, 33 | H5: 60, 34 | } 35 | 36 | export default class DocsearchScaper { 37 | sender: Sender 38 | settings: Config['meilisearch_settings'] 39 | 40 | constructor(sender: Sender, config?: Config) { 41 | console.info('DocsearchScaper::constructor') 42 | this.sender = sender 43 | 44 | // Predefined settings 45 | const defaultSettings = { 46 | distinctAttribute: 'url', 47 | rankingRules: [ 48 | 'words', 49 | 'typo', 50 | 'attribute', 51 | 'proximity', 52 | 'exactness', 53 | 'page_rank:desc', 54 | 'level:desc', 55 | 'position:asc', 56 | ], 57 | searchableAttributes: [ 58 | 'hierarchy_radio_lvl1', 59 | 'hierarchy_radio_lvl2', 60 | 'hierarchy_radio_lvl3', 61 | 'hierarchy_radio_lvl4', 62 | 'hierarchy_radio_lvl5', 63 | 'hierarchy_lvl1', 64 | 'hierarchy_lvl2', 65 | 'hierarchy_lvl3', 66 | 'hierarchy_lvl4', 67 | 'hierarchy_lvl5', 68 | 'hierarchy_radio_lvl0', 69 | 'hierarchy_lvl0', 70 | 'content', 71 | ], 72 | } 73 | 74 | // Merge user-defined settings with predefined settings 75 | this.settings = { 76 | ...defaultSettings, 77 | ...(config?.meilisearch_settings || {}), 78 | } 79 | 80 | void this.sender.updateSettings(this.settings) 81 | } 82 | 83 | _amount_of_hierarchies(pageMap: DocsSearchDocument) { 84 | return Object.keys(pageMap).filter((key) => key.startsWith('hierarchy_lvl')) 85 | .length 86 | } 87 | _is_h_tag(tag: string) { 88 | return tag.startsWith('H') 89 | } 90 | 91 | // Remove all hierarchies that are lower than the current level. 92 | // Considering hierarchy_level_5 is lower than hierarchy_level_4. 
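// e.g. once an H2 is reached, any hierarchy_lvl3, hierarchy_lvl4 and hierarchy_lvl5 values carried over from the previous section are deleted before the new heading's fields are filled in.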
93 | _remove_lower_lvl_hierarchies( 94 | pageMap: DocsSearchDocument, 95 | currentLevel: string 96 | ): DocsSearchDocument { 97 | for (const hierarchy in pageMap) { 98 | const levelMatch = hierarchy.match(/\d+/) || [] 99 | const currentLevelMatch = currentLevel.match(/\d+/) || [] 100 | if (levelMatch[0] && currentLevelMatch[0]) { 101 | if (parseInt(levelMatch[0]) > parseInt(currentLevelMatch[0])) { 102 | delete pageMap[hierarchy as keyof DocsSearchDocument] 103 | } 104 | } 105 | } 106 | return pageMap 107 | } 108 | 109 | _empty_radio_lvl_hierarchies( 110 | document: DocsSearchDocument 111 | ): DocsSearchDocument { 112 | return { 113 | ...document, 114 | hierarchy_radio_lvl0: null, 115 | hierarchy_radio_lvl1: null, 116 | hierarchy_radio_lvl2: null, 117 | hierarchy_radio_lvl3: null, 118 | hierarchy_radio_lvl4: null, 119 | hierarchy_radio_lvl5: null, 120 | } 121 | } 122 | 123 | _fill_lvl_fields( 124 | document: DocsSearchDocument, 125 | tag: HTag, 126 | text: string 127 | ): DocsSearchDocument { 128 | return { 129 | ...document, 130 | [HIERARCHY_LEVELS[tag]]: text, 131 | [RADIO_HIERARCHY_LEVELS[tag]]: text, 132 | } 133 | } 134 | 135 | _update_document( 136 | document: DocsSearchDocument, 137 | tag: HTag, 138 | text: string, 139 | anchor?: string 140 | ): DocsSearchDocument { 141 | document = { 142 | ...document, 143 | level: TAG_LEVELS[tag], 144 | } 145 | document = this._empty_radio_lvl_hierarchies(document) 146 | document = this._remove_lower_lvl_hierarchies( 147 | document, 148 | HIERARCHY_LEVELS[tag] 149 | ) 150 | document = this._fill_lvl_fields(document, tag, text) 151 | document['anchor'] = anchor ? `#${anchor}` : '' 152 | return document 153 | } 154 | 155 | async get(url: string, page: Page) { 156 | //for each page create dataset of consecutive h1, h2, h3, p. 
at each header after a paragraph, create a new dataset 157 | // needs to be able to provide the `main` or `article` tag 158 | // TODO: create a configuration to provide the main tag in which the content is 159 | let elems = await page.$$( 160 | 'main h1, main h2, main h3, main h4, main h5, main p, main td, main li, main span' 161 | ) 162 | if (elems.length === 0) { 163 | elems = await page.$$('h1, h2, h3, h4, h5, p, td, li, span') 164 | } 165 | let document = {} as DocsSearchDocument 166 | document = this._empty_radio_lvl_hierarchies(document) 167 | 168 | for (let i = 0; i < elems.length; i++) { 169 | const elem = elems[i] 170 | const tag = await elem.evaluate((el) => el.tagName) 171 | let text = (await elem.evaluate((el) => el.textContent)) || '' 172 | text = this._clean_text(text) 173 | 174 | const urls_tags = new URL(url).pathname.split('/') 175 | const only_urls_tags = urls_tags.slice(1, urls_tags.length - 1) 176 | document['hierarchy_lvl0'] = only_urls_tags.join(' > ') || '' 177 | document['url'] = url 178 | 179 | // Every time a H tag is found, the previous content is indexed and then emptied 180 | if ( 181 | this._is_h_tag(tag) && 182 | this._amount_of_hierarchies(document) > 1 && 183 | document['content'] && 184 | document['content'].length > 0 185 | ) { 186 | await this._send_data({ ...document, type: 'content' }) 187 | document['content'] = [] 188 | } 189 | 190 | const anchor = await elem.evaluate((el) => el.id) 191 | if (tag === 'H1') { 192 | document = Object.assign( 193 | {}, 194 | this._update_document(document, tag, text, anchor) 195 | ) 196 | } else if (tag === 'H2') { 197 | document = Object.assign( 198 | {}, 199 | this._update_document(document, tag, text, anchor) 200 | ) 201 | } else if (tag === 'H3') { 202 | document = Object.assign( 203 | {}, 204 | this._update_document(document, tag, text, anchor) 205 | ) 206 | } else if (tag === 'H4') { 207 | document = Object.assign( 208 | {}, 209 | this._update_document(document, tag, text, anchor) 210 | ) 211 | } else if (tag === 'H5') { 212 | document = Object.assign( 213 | {}, 214 | this._update_document(document, tag, text, anchor) 215 | ) 216 | } else if ( 217 | (tag === 'P' || tag === 'TD' || tag === 'LI' || tag === 'SPAN') && 218 | this._amount_of_hierarchies(document) > 1 219 | ) { 220 | if (!document['content']) { 221 | document['content'] = [] 222 | } 223 | if ( 224 | text !== null && 225 | Array.isArray(document['content']) && 226 | !document['content'].includes(text) 227 | ) { 228 | document['content'].push(text) 229 | } 230 | } 231 | } 232 | // Send remaining data 233 | if (document.content && document.content?.length > 0) { 234 | await this._send_data({ ...document }) 235 | } 236 | } 237 | 238 | async _send_data(data: DocsSearchDocument) { 239 | try { 240 | data.uid = uuidv4() 241 | data.url = data.url + data.anchor 242 | if (Array.isArray(data['content'])) { 243 | data['content'] = data['content'].join('\n') 244 | } else { 245 | data['content'] = '' 246 | } 247 | await this.sender.add(data) 248 | } catch (e) { 249 | console.log('error', e) 250 | } 251 | } 252 | 253 | // Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and 254 | // remove '# ' from the beginning of the text 255 | _clean_text(text: string) { 256 | text = text.replace(/[\r\n]+/gm, ' ') 257 | ///remove multiple spaces 258 | text = text.replace(/\s+/g, ' ') 259 | ///remove '# ' 260 | text = text.replace('# ', '') 261 | /// Trim leading and trailing spaces 262 | text = text.replace(/^\s+|\s+$/g, '') 263 | return text 264 
| } 265 | } 266 | -------------------------------------------------------------------------------- /src/scrapers/schema.ts: -------------------------------------------------------------------------------- 1 | import { v4 as uuidv4 } from 'uuid' 2 | import { Page } from 'puppeteer' 3 | import { Sender } from '../sender' 4 | import { Config, SchemaDocument } from '../types' 5 | 6 | export default class SchemaScraper { 7 | sender: Sender 8 | config: Config 9 | settings_sent: boolean 10 | 11 | constructor(sender: Sender, config: Config) { 12 | console.info('SchemaScraper::constructor') 13 | this.sender = sender 14 | this.config = config 15 | this.settings_sent = false 16 | 17 | if (this.config.meilisearch_settings) { 18 | void this.sender.updateSettings(this.config.meilisearch_settings) 19 | this.settings_sent = true 20 | } 21 | } 22 | 23 | async get(url: string, page: Page) { 24 | console.log('__extractContent', url) 25 | // Get the schema.org data 26 | const data = (await page.evaluate((): Record<string, any> => { 27 | const schema = document.querySelector( 28 | "script[type='application/ld+json']" 29 | ) 30 | if (schema) { 31 | return JSON.parse(schema.innerText) as Record<string, any> 32 | } 33 | return {} // TODO: raise error 34 | })) as SchemaDocument 35 | 36 | // TODO: use zod here instead of forcing `as SchemaDocument`? 37 | 38 | if (Object.keys(data).length === 0) return 39 | 40 | if (this.config.schema_settings?.only_type) { 41 | if (data['@type'] !== this.config.schema_settings?.only_type) return 42 | } 43 | 44 | this._clean_schema(data) 45 | 46 | if (this.config.schema_settings?.convert_dates) { 47 | // Convert dates to timestamps 48 | Object.keys(data).forEach((key) => { 49 | if (typeof data[key] === 'string') { 50 | // Check if it is a date 51 | if (Date.parse(data[key])) { 52 | data[key] = Date.parse(data[key]) 53 | } 54 | } 55 | }) 56 | } 57 | 58 | if (data['@graph']) { 59 | for (const graph of data['@graph']) { 60 | graph.uid = uuidv4() 61 | await this.sender.add(graph) 62 | } 63 | } else { 64 | data.uid = uuidv4() 65 | await this.sender.add(data) 66 | } 67 | } 68 | 69 | _clean_schema(data: SchemaDocument) { 70 | if (data['@context']) { 71 | delete data['@context'] 72 | } 73 | if (data['@type']) { 74 | delete data['@type'] 75 | } 76 | Object.keys(data).forEach((key) => { 77 | if (typeof data[key] === 'object') { 78 | this._clean_schema(data[key]) 79 | } 80 | }) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/sender.ts: -------------------------------------------------------------------------------- 1 | import { MeiliSearch, Settings } from 'meilisearch' 2 | import { Config, DocumentType } from './types' 3 | import { initMeilisearchClient } from './meilisearch_client' 4 | import { Webhook } from './webhook' 5 | 6 | // Sender queues the JSON documents and sends them to a Meilisearch instance in batches 7 | export class Sender { 8 | config: Config 9 | queue: DocumentType[] = [] 10 | initial_index_uid: string 11 | index_uid: string 12 | batch_size: number 13 | client: MeiliSearch 14 | timeout: number 15 | nb_documents_sent = 0 16 | 17 | constructor(config: Config) { 18 | console.info('Sender::constructor') 19 | this.config = config 20 | this.initial_index_uid = config.meilisearch_index_uid 21 | this.index_uid = this.initial_index_uid 22 | this.batch_size = config.batch_size || 1000 23 | this.timeout = config.timeout || 100000 24 | 25 | // Create a Meilisearch client 26 | this.client = initMeilisearchClient({ 27 | host: config.meilisearch_url, 28 |
apiKey: config.meilisearch_api_key, 29 | clientAgents: config.user_agents, 30 | }) 31 | } 32 | 33 | async init() { 34 | console.log('Sender::init') 35 | try { 36 | await Webhook.get(this.config).started(this.config) 37 | const index = await this.client.getIndex(this.initial_index_uid) 38 | 39 | if (index) { 40 | this.index_uid = this.initial_index_uid + '_crawler_tmp' 41 | 42 | const tmp_index = await this.client.getIndex(this.index_uid) 43 | if (tmp_index) { 44 | const task = await this.client.deleteIndex(this.index_uid) 45 | await this.client.waitForTask(task.taskUid) 46 | } 47 | } 48 | 49 | await this.client.createIndex(this.index_uid, { 50 | primaryKey: this.config.primary_key || 'uid', 51 | }) 52 | } catch (e) { 53 | console.log('try to delete a tmp index if it exists') 54 | } 55 | } 56 | 57 | //Add a json object to the queue 58 | async add(data: DocumentType) { 59 | this.nb_documents_sent++ 60 | 61 | if (this.config.primary_key && this.config.primary_key !== 'uid') { 62 | delete data['uid'] 63 | } 64 | 65 | if (this.batch_size) { 66 | this.queue.push(data) 67 | if (this.queue.length >= this.batch_size) { 68 | this.__batchSend() 69 | this.queue = [] 70 | } 71 | } else { 72 | await this.client.index(this.index_uid).addDocuments([data]) 73 | } 74 | } 75 | 76 | async updateSettings(settings: Settings) { 77 | console.log('Sender::updateSettings') 78 | const task = await this.client 79 | .index(this.index_uid) 80 | .updateSettings(settings) 81 | await this.client.waitForTask(task.taskUid) 82 | } 83 | 84 | async finish() { 85 | await this.__batchSendSync() 86 | const index = await this.client.getIndex(this.index_uid) 87 | const stats = await index.getStats() 88 | if ( 89 | this.index_uid !== this.initial_index_uid && 90 | stats.numberOfDocuments > 0 91 | ) { 92 | await this.__swapIndex() 93 | } else if (this.index_uid !== this.initial_index_uid) { 94 | const task = await this.client.deleteIndex(this.index_uid) 95 | await this.client.index(this.index_uid).waitForTask(task.taskUid) 96 | } 97 | 98 | await Webhook.get(this.config).completed( 99 | this.config, 100 | this.nb_documents_sent 101 | ) 102 | console.log('Sender::Finish') 103 | } 104 | 105 | __batchSend() { 106 | console.log(`Sender::__batchSend - size: ${this.queue.length}`) 107 | this.client 108 | .index(this.index_uid) 109 | .addDocuments(this.queue) 110 | .catch((e) => { 111 | console.log(e) 112 | console.log('Error while sending data to MeiliSearch') 113 | }) 114 | } 115 | 116 | async __batchSendSync() { 117 | console.log(`Sender::__batchSend - size: ${this.queue.length}`) 118 | const task = await this.client 119 | .index(this.index_uid) 120 | .addDocuments(this.queue) 121 | await this.client.waitForTask(task.taskUid, { timeOutMs: this.timeout }) 122 | } 123 | 124 | async __swapIndex() { 125 | console.log('Sender::__swapIndex') 126 | await this.client.swapIndexes([ 127 | { indexes: [this.initial_index_uid, this.index_uid] }, 128 | ]) 129 | const task = await this.client.deleteIndex(this.index_uid) 130 | await this.client.index(this.index_uid).waitForTask(task.taskUid) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/server.ts: -------------------------------------------------------------------------------- 1 | import * as dotenv from 'dotenv' 2 | dotenv.config() 3 | 4 | import express from 'express' 5 | import { TaskQueue } from './taskQueue' 6 | import { Sender } from './sender' 7 | import { Crawler } from './crawler' 8 | 9 | const port = process.env.PORT || 8080 10 | 11 
| class Server { 12 | taskQueue: TaskQueue 13 | app: express.Application 14 | 15 | constructor() { 16 | this.__check_env() 17 | 18 | this.taskQueue = new TaskQueue() 19 | this.app = express() 20 | this.app.use(express.json()) 21 | this.app.post('/crawl', this.__asyncCrawl.bind(this)) 22 | this.app.post('/crawl/async', this.__asyncCrawl.bind(this)) 23 | this.app.post('/crawl/sync', this.__syncCrawl.bind(this)) 24 | this.app.post('/crawl/start', this.__startCrawl.bind(this)) 25 | this.app.post('/webhook', this.__log_webhook.bind(this)) 26 | 27 | this.app.listen(port, () => 28 | console.log(`Crawler app listening on port ${port}!`) 29 | ) 30 | } 31 | 32 | __check_env() { 33 | const { REDIS_URL, WEBHOOK_URL, WEBHOOK_TOKEN, WEBHOOK_INTERVAL } = 34 | process.env 35 | 36 | console.log('REDIS_URL: ', REDIS_URL) 37 | console.log('WEBHOOK_URL: ', WEBHOOK_URL) 38 | console.log('WEBHOOK_TOKEN: ', WEBHOOK_TOKEN) 39 | console.log('WEBHOOK_INTERVAL: ', WEBHOOK_INTERVAL) 40 | } 41 | 42 | __asyncCrawl(req: express.Request, res: express.Response) { 43 | this.taskQueue.add(req.body) 44 | console.log('Crawling started') 45 | res.send('Crawling started') 46 | } 47 | 48 | async __syncCrawl(req: express.Request, res: express.Response) { 49 | const sender = new Sender(req.body) 50 | await sender.init() 51 | 52 | const crawler = new Crawler(sender, req.body) 53 | 54 | await crawler.run() 55 | await sender.finish() 56 | 57 | res.send('Crawling finished') 58 | } 59 | 60 | async __startCrawl(req: express.Request, res: express.Response) { 61 | console.log('Crawling started') 62 | res.send('Crawling started') 63 | 64 | const sender = new Sender(req.body) 65 | await sender.init() 66 | 67 | const crawler = new Crawler(sender, req.body) 68 | 69 | await crawler.run() 70 | await sender.finish() 71 | } 72 | 73 | __log_webhook(req: express.Request, res: express.Response) { 74 | console.log('webhook received: ', req.body) 75 | res.send('ok') 76 | } 77 | } 78 | 79 | new Server() 80 | -------------------------------------------------------------------------------- /src/taskQueue.ts: -------------------------------------------------------------------------------- 1 | import Queue, { Job, DoneCallback } from 'bull' 2 | import { initMeilisearchClient } from './meilisearch_client' 3 | import { fork } from 'child_process' 4 | import { Config } from './types' 5 | 6 | export class TaskQueue { 7 | queue: Queue.Queue 8 | 9 | constructor() { 10 | console.info('TaskQueue::constructor') 11 | if (process.env.REDIS_URL) { 12 | this.queue = new Queue('crawling', process.env.REDIS_URL) 13 | } else { 14 | this.queue = new Queue('crawling') 15 | } 16 | void this.queue.process(this.__process.bind(this)) 17 | this.queue.on('added', this.__jobAdded.bind(this)) 18 | this.queue.on('completed', this.__jobCompleted.bind(this)) 19 | this.queue.on('failed', this.__jobFailed.bind(this)) 20 | this.queue.on('active', this.__jobActive.bind(this)) 21 | this.queue.on('wait', this.__jobWaiting.bind(this)) 22 | this.queue.on('delayed', this.__jobDelayed.bind(this)) 23 | } 24 | 25 | add(data: Config) { 26 | void this.queue.add(data) 27 | } 28 | 29 | __process(job: Job, done: DoneCallback) { 30 | console.log('Job process', job.id) 31 | const childProcess = fork('./dist/src/crawler_process.js') 32 | childProcess.send(job.data) 33 | childProcess.on('message', (message) => { 34 | console.log(message) 35 | done() 36 | }) 37 | } 38 | 39 | __jobAdded(job: Job) { 40 | console.log('Job added', job.id) 41 | } 42 | 43 | __jobCompleted(job: Job) { 44 | console.log('Job 
completed', job.id) 45 | } 46 | 47 | async __jobFailed(job: Job) { 48 | console.log('Job failed', job.id) 49 | // Create a Meilisearch client 50 | const client = initMeilisearchClient({ 51 | host: job.data.meilisearch_url, 52 | apiKey: job.data.meilisearch_api_key, 53 | clientAgents: job.data.user_agents, 54 | }) 55 | 56 | // Check if the tmp index exists 57 | const tmp_index_uid = job.data.meilisearch_index_uid + '_crawler_tmp' 58 | try { 59 | const index = await client.getIndex(tmp_index_uid) 60 | if (index) { 61 | const task = await client.deleteIndex(tmp_index_uid) 62 | await client.waitForTask(task.taskUid) 63 | } 64 | } catch (e) { 65 | console.error(e) 66 | } 67 | } 68 | 69 | __jobActive(job: Job) { 70 | console.log({ job }) 71 | console.log('Job active', job.id) 72 | } 73 | 74 | __jobWaiting(job: Job) { 75 | console.log('Job waiting', job.id) 76 | } 77 | 78 | __jobDelayed(job: Job) { 79 | console.log('Job delayed', job.id) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { Settings } from 'meilisearch' 2 | import DocsearchScraper from './scrapers/docssearch' 3 | import DefaultScraper from './scrapers/default' 4 | import SchemaScraper from './scrapers/schema' 5 | 6 | export type Config = { 7 | meilisearch_index_uid: string 8 | meilisearch_url: string 9 | meilisearch_api_key: string 10 | start_urls: string[] 11 | urls_to_exclude?: string[] 12 | additional_request_headers?: Record<string, string> 13 | queue?: string[] 14 | primary_key?: string 15 | batch_size?: number 16 | meilisearch_settings?: Settings 17 | strategy?: 'docssearch' | 'default' | 'schema' 18 | headless?: 'new' | 'old' | boolean 19 | urls_to_index?: string[] // Overwrites start_urls if present 20 | urls_to_not_index?: string[] 21 | schema_settings?: SchemaSettings 22 | user_agents?: string[] 23 | webhook_payload?: Record<string, any> 24 | webhook_url?: string 25 | timeout?: number 26 | } 27 | 28 | export type SchemaSettings = { 29 | convert_dates: boolean 30 | only_type: string 31 | } 32 | 33 | export type Scraper = DocsearchScraper | DefaultScraper | SchemaScraper 34 | 35 | export type DocumentType = DocsSearchDocument | DefaultDocument | SchemaDocument 36 | 37 | export type HierarchyLevel = { 38 | hierarchy_lvl0: string | null 39 | hierarchy_lvl1: string | null 40 | hierarchy_lvl2: string | null 41 | hierarchy_lvl3: string | null 42 | hierarchy_lvl4: string | null 43 | hierarchy_lvl5: string | null 44 | } 45 | 46 | export type RadioHierarchyLevel = { 47 | hierarchy_radio_lvl0: string | null 48 | hierarchy_radio_lvl1: string | null 49 | hierarchy_radio_lvl2: string | null 50 | hierarchy_radio_lvl3: string | null 51 | hierarchy_radio_lvl4: string | null 52 | hierarchy_radio_lvl5: string | null 53 | } 54 | 55 | export type HTag = 'H1' | 'H2' | 'H3' | 'H4' | 'H5' 56 | 57 | export type DocsSearchDocument = HierarchyLevel & 58 | RadioHierarchyLevel & { 59 | url: string 60 | uid?: string 61 | anchor: string 62 | content?: string[] | string 63 | level: number 64 | type: 'lvl0' | 'lvl1' | 'lvl2' | 'lvl3' | 'lvl4' | 'lvl5' | 'content' 65 | } 66 | 67 | export type DefaultDocument = { 68 | url: string 69 | uid?: string 70 | anchor: string 71 | title: string 72 | meta: Meta 73 | image_url?: string 74 | page_block: number 75 | urls_tags: string[] 76 | h1?: string | null 77 | h2?: string | null 78 | h3?: string | null 79 | h4?: string | null 80 | h5?: string | null 81 | h6?: string | null 82 | p: string[] |
string 83 | } 84 | 85 | export type SchemaDocument = { 86 | uid: string 87 | [key: string]: any 88 | } 89 | 90 | export type Meta = { 91 | [name: string]: string 92 | } 93 | -------------------------------------------------------------------------------- /src/webhook.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosResponse } from 'axios' 2 | import { Config } from './types' 3 | 4 | // This webhook sender is a singleton 5 | export class Webhook { 6 | private static instance: Webhook 7 | private webhook_url: string | undefined 8 | 9 | configured = false 10 | 11 | constructor(config: Config) { 12 | console.info('Webhook::constructor') 13 | if (config.webhook_url || process.env.WEBHOOK_URL) { 14 | this.configured = true 15 | this.webhook_url = config.webhook_url || process.env.WEBHOOK_URL 16 | } else { 17 | console.warn( 18 | 'Webhook not configured; if you want to use a webhook, set the WEBHOOK_URL environment variable or provide the webhook_url option in the config' 19 | ) 20 | } 21 | } 22 | 23 | public static get(config: Config): Webhook { 24 | if (!Webhook.instance) { 25 | Webhook.instance = new Webhook(config) 26 | } 27 | return Webhook.instance 28 | } 29 | 30 | async started(config: Config) { 31 | if (!this.configured) return 32 | await this.__callWebhook(config, { status: 'started' }) 33 | } 34 | 35 | async active(config: Config, data: Record<string, any>) { 36 | if (!this.configured) return 37 | await this.__callWebhook(config, { status: 'active', ...data }) 38 | } 39 | 40 | async paused(config: Config) { 41 | if (!this.configured) return 42 | await this.__callWebhook(config, { status: 'paused' }) 43 | } 44 | 45 | async completed(config: Config, nbDocumentsSent: number) { 46 | if (!this.configured) return 47 | await this.__callWebhook(config, { 48 | status: 'completed', 49 | nb_documents_sent: nbDocumentsSent, 50 | }) 51 | } 52 | 53 | async failed(config: Config, error: Error) { 54 | if (!this.configured) return 55 | await this.__callWebhook(config, { status: 'failed', error: error.message }) 56 | } 57 | 58 | async __callWebhook(config: Config, data: any) { 59 | if (!this.webhook_url) return 60 | try { 61 | data.meilisearch_url = config.meilisearch_url 62 | data.meilisearch_index_uid = config.meilisearch_index_uid 63 | 64 | if (config.webhook_payload) { 65 | data.webhook_payload = config.webhook_payload 66 | } 67 | 68 | const date = new Date() 69 | data.date = date.toISOString() 70 | 71 | const headers: Record<string, string> = { 72 | 'Content-Type': 'application/json', 73 | } 74 | 75 | if (process.env.WEBHOOK_TOKEN) { 76 | headers['Authorization'] = `Bearer ${process.env.WEBHOOK_TOKEN}` 77 | } 78 | 79 | const response: AxiosResponse = await axios.post(this.webhook_url, data, { 80 | headers: headers, 81 | }) 82 | if (response.status === 401 || response.status === 403) { 83 | this.configured = false 84 | } 85 | } catch (error) { 86 | console.error('Error calling webhook:', error) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | // extend your base config so you don't have to redefine your compilerOptions 3 | "extends": "./tsconfig.json", 4 | "compilerOptions": { 5 | "allowJs": false, 6 | "module": "esnext", 7 | }, 8 | "include": [ 9 | "src/**/*.ts", 10 | "playground/**/*.tsx", 11 | ".eslintrc.cjs" 12 | ] 13 | } 14 |
-------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "CommonJS", 5 | "target": "ES2022", 6 | "outDir": "./dist/src", 7 | "allowJs": true, 8 | "strict": true, 9 | "esModuleInterop": true 10 | }, 11 | "include": ["src/**/*"], 12 | "exclude": ["node_modules"] 13 | } 14 | --------------------------------------------------------------------------------
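Note on consuming the webhook: webhook.ts posts status updates (started, active, paused, completed, failed) to WEBHOOK_URL or config.webhook_url, enriching each payload with meilisearch_url, meilisearch_index_uid, an ISO date, and, when relevant, nb_documents_sent, error, and webhook_payload, plus an Authorization: Bearer header when WEBHOOK_TOKEN is set. The following is a minimal sketch of a receiver, not part of this repository; the /scrapix-webhook path, the port, and the CrawlerWebhookEvent type name are illustrative assumptions, and only field names actually emitted by webhook.ts are relied on.

import express from 'express'

// Assumed shape of one webhook event, mirroring the fields set in webhook.ts
type CrawlerWebhookEvent = {
  status: 'started' | 'active' | 'paused' | 'completed' | 'failed'
  date: string
  meilisearch_url: string
  meilisearch_index_uid: string
  nb_documents_sent?: number
  error?: string
  webhook_payload?: Record<string, any>
}

const app = express()
app.use(express.json())

app.post('/scrapix-webhook', (req, res) => {
  // When WEBHOOK_TOKEN is set on the crawler side, it is sent as a Bearer token;
  // answering 401/403 makes the crawler mark the webhook as unconfigured.
  const expected = process.env.WEBHOOK_TOKEN
  if (expected && req.headers.authorization !== `Bearer ${expected}`) {
    return res.status(401).send('invalid token')
  }

  const event = req.body as CrawlerWebhookEvent
  console.log(`[${event.date}] ${event.meilisearch_index_uid}: ${event.status}`)
  if (event.status === 'completed') {
    console.log(`documents sent: ${event.nb_documents_sent ?? 0}`)
  }
  return res.send('ok')
})

app.listen(3100, () => console.log('webhook receiver listening on 3100'))

To use such a receiver, set WEBHOOK_URL (or webhook_url in the crawl config) to the receiver's address before starting the server or a crawl job.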