├── .editorconfig ├── .github └── workflows │ └── sync-to-astro.yml ├── .gitignore ├── LICENSE.md ├── README.md ├── img ├── apify-actor-drawing.png ├── apify-store.png └── screenshot-taker-input.png ├── package-lock.json ├── package.json ├── pages ├── ACTOR_FILE.md ├── DATASET_SCHEMA.md ├── IDEAS.md ├── INPUT_SCHEMA.md ├── KEY_VALUE_STORE_SCHEMA.md ├── OUTPUT_SCHEMA.md ├── REQUEST_QUEUE_SCHEMA.md └── tmp_schema_experiments │ ├── amazon_scraper │ └── .actor │ │ ├── ACTOR.json │ │ ├── INPUT_SCHEMA.json │ │ └── OUTPUT_SCHEMA.json │ ├── dataset-viewer │ └── .actor │ │ └── OUTPUT_SCHEMA.json │ └── google_search_scraper │ └── .ACTOR │ ├── ACTOR.json │ ├── INPUT_SCHEMA.json │ ├── OUTPUT.json │ ├── OUTPUT_SCHEMA.json │ └── schemas │ ├── GOOGLE_SERPS_DATASET_SCHEMA.json │ └── GOOGLE_SERPS_SCREENSHOTS_KV_STORE_SCHEMA.json ├── requirements.txt ├── scripts ├── md2mdx.py ├── setup.sh └── test-sync.sh └── sync-pr-template.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.md] 2 | indent_size = 2 3 | indent_style = space 4 | -------------------------------------------------------------------------------- /.github/workflows/sync-to-astro.yml: -------------------------------------------------------------------------------- 1 | name: Sync Whitepaper to Astro (PR flow) 2 | 3 | # Triggers on push to master when MD files or related assets change. 4 | on: 5 | push: 6 | branches: 7 | - master 8 | paths: 9 | - '*.md' 10 | - 'pages/**/*.md' 11 | - '.github/workflows/sync-to-astro.yml' 12 | - 'scripts/**/*.py' 13 | workflow_dispatch: # also allows manual trigger from GitHub UI 14 | 15 | env: 16 | TARGET_REPO: "apify/actor-whitepaper-web" 17 | TARGET_BRANCH: "sync/whitepaper-updates" 18 | 19 | jobs: 20 | sync: 21 | name: Sync Whitepaper to Astro (PR flow) 22 | runs-on: ubuntu-latest 23 | permissions: 24 | contents: write # needed for pushing changes 25 | pull-requests: write # needed for creating PRs 26 | 27 | steps: 28 | # Step 1: Clone the source repo (this repo). 29 | - name: Checkout source repo 30 | uses: actions/checkout@v4 31 | with: 32 | path: sync/source 33 | 34 | # Step 2: Clone the target repo (Astro site). 35 | - name: Checkout target repo 36 | uses: actions/checkout@v4 37 | with: 38 | repository: ${{ env.TARGET_REPO }} 39 | path: sync/target 40 | token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} 41 | 42 | # Step 3: Set up the Python environment. 43 | - name: Setup Python 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: '3.11' 47 | cache: 'pip' 48 | cache-dependency-path: sync/source/requirements.txt 49 | 50 | # Step 4: Install dependencies. 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install -r sync/source/requirements.txt 55 | cp sync/source/package.json . && npm install 56 | 57 | # Step 5: Run the MD to MDX conversion script. 58 | - name: Run sync script 59 | run: python sync/source/scripts/md2mdx.py --source sync/source --target sync/target 60 | 61 | # Step 6: Create or update PR with changes. 62 | - name: Create Pull Request 63 | env: 64 | GH_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} 65 | run: | 66 | cd sync/target 67 | git status 68 | 69 | # Create a unique branch name with timestamp. 
70 | BRANCH_NAME="sync/whitepaper-updates-$(date +%Y%m%d-%H%M%S)" 71 | echo "Using branch: $BRANCH_NAME" 72 | 73 | git config user.name "github-actions[bot]" 74 | git config user.email "github-actions[bot]@users.noreply.github.com" 75 | 76 | git checkout -b "$BRANCH_NAME" 77 | 78 | # Only create PR if there are changes. 79 | if [[ -n "$(git status --porcelain)" ]]; then 80 | echo "Changes detected:" 81 | git status --porcelain 82 | 83 | git add . 84 | git commit -m "sync: Update MDX content from Whitepaper" 85 | if ! git push -f origin "$BRANCH_NAME"; then 86 | echo "Failed to push changes" 87 | exit 1 88 | fi 89 | 90 | # Create the PR using GitHub CLI. 91 | gh pr create \ 92 | --title "sync: Update MDX content from Whitepaper" \ 93 | --body-file ../source/sync-pr-template.txt \ 94 | --base main \ 95 | --head "$BRANCH_NAME" \ 96 | --label "sync" \ 97 | --assignee ${{ github.actor }} 98 | else 99 | echo "No changes detected in git status --porcelain" 100 | echo "Full directory contents of src/content/pages:" 101 | ls -la src/content/pages/ 102 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | sync 4 | 5 | # Python 6 | .venv 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | .Python 11 | pip-log.txt 12 | pip-delete-this-directory.txt 13 | .pytest_cache/ 14 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Apify Technologies s.r.o. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /img/apify-actor-drawing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/apify-actor-drawing.png -------------------------------------------------------------------------------- /img/apify-store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/apify-store.png -------------------------------------------------------------------------------- /img/screenshot-taker-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/screenshot-taker-input.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "actor-specs", 3 | "version": "0.0.1", 4 | "description": "This is just to generate the table of contents in Markdown files", 5 | "devDependencies": { 6 | "markdown-link-check": "^3.13.6", 7 | "markdown-toc": "^1.2.0", 8 | "prettier": "^3.5.1", 9 | "prettier-plugin-astro": "^0.14.1", 10 | "prettier-plugin-astro-organize-imports": "^0.4.11", 11 | "prettier-plugin-css-order": "^2.1.2", 12 | "prettier-plugin-jsdoc": "^1.3.2", 13 | "prettier-plugin-organize-attributes": "^1.0.0", 14 | "prettier-plugin-organize-imports": "^4.1.0", 15 | "prettier-plugin-tailwindcss": "^0.6.11" 16 | }, 17 | "scripts": { 18 | "build-toc": "./node_modules/.bin/markdown-toc README.md -i --maxdepth 3 && ./node_modules/.bin/markdown-link-check README.md", 19 | "test-sync": "chmod +x scripts/*.sh && ./scripts/test-sync.sh", 20 | "format-sync": "prettier --write --config sync/target/.prettierrc.cjs --ignore-path false --plugin=prettier-plugin-astro --plugin=prettier-plugin-organize-imports --plugin=prettier-plugin-organize-attributes --plugin=prettier-plugin-astro-organize-imports --plugin=prettier-plugin-css-order --plugin=prettier-plugin-tailwindcss --plugin=prettier-plugin-jsdoc \"./sync/target/src/content/pages/**/*.mdx\"" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "https://github.com/apifytech/actor-scraper" 25 | }, 26 | "author": { 27 | "name": "Jan Curn", 28 | "email": "jan@apify.com", 29 | "url": "https://apify.com/jancurn" 30 | }, 31 | "license": "Apache-2.0" 32 | } 33 | -------------------------------------------------------------------------------- /pages/ACTOR_FILE.md: -------------------------------------------------------------------------------- 1 | # Actor file specification 2 | 3 | This JSON file must be present at `.actor/actor.json` and defines core properties of a single web Actor. 4 | 5 | The file contains a single JSON object with the following properties: 6 | 7 | ```jsonc 8 | { 9 | // Required field, indicates that this is an Actor definition file and the specific version of the Actor specification. 10 | "actorSpecification": 1, 11 | 12 | // Required "technical" name of the Actor, must be a DNS hostname-friendly text. 13 | "name": "google-search-scraper", 14 | 15 | // Human-friendly name and description of the Actor. 16 | "title": "Google Search Scraper", 17 | "description": "A 200-char description", 18 | 19 | // Required, indicates the version of the Actor. 
Since the actor.json file is committed to Git, you can have different Actor 20 | // versions in different branches. 21 | "version": "0.0", 22 | 23 | // Optional tag that is applied to the builds of this Actor. If omitted, it defaults to "latest". 24 | "buildTag": "latest", 25 | 26 | // An optional object with environment variables expected by the Actor. 27 | // Secret values are prefixed by @ and their actual values need to be registered with the CLI, for example: 28 | // $ apify secrets add mySecretPassword pwd1234 29 | "environmentVariables": { 30 | "MYSQL_USER": "my_username", 31 | "MYSQL_PASSWORD": "@mySecretPassword" 32 | }, 33 | 34 | // Optional field. If true, the Actor indicates it can be run in Standby mode, 35 | // to get started and be kept alive by the system to handle incoming HTTP REST requests by the Actor's web server. 36 | "usesStandbyMode": true, 37 | 38 | // An optional metadata object enabling implementations to pass arbitrary additional properties. 39 | // The properties and their values must be strings. 40 | "labels": { 41 | "something": "bla bla" 42 | }, 43 | 44 | // Optional minimum and maximum memory for running the Actor. 45 | "minMemoryMbytes": 128, 46 | "maxMemoryMbytes": 4096, 47 | 48 | // Optional link to the Actor Dockerfile. 49 | // If omitted, the system looks for "./Dockerfile" or "../Dockerfile" 50 | "dockerfile": "./Dockerfile", 51 | 52 | // Optional link to the Actor README file in Markdown format. 53 | // If omitted, the system looks for "./ACTOR.md" and "../README.md" 54 | "readme": "./README.md", 55 | 56 | // Optional link to the Actor changelog file in Markdown format. 57 | "changelog": "../../../shared/CHANGELOG.md", 58 | 59 | // Optional link to Actor input or output schema file, or inlined schema object, 60 | // which is a JSON schema with our extensions. For details see ./INPUT_SCHEMA.md or ./OUTPUT_SCHEMA.md, respectively. 61 | // BACKWARDS COMPATIBILITY: "inputSchema" used to be called "input", all implementations should support this. 62 | "inputSchema": "./input_schema.json", 63 | "outputSchema": "./output_schema.json", 64 | 65 | // Optional path to Dataset or Key-value Store schema file or inlined schema object for the Actor's default dataset or key-value store. 66 | // For details, see ./DATASET_SCHEMA.md or ./KEY_VALUE_STORE_SCHEMA.md, respectively. 67 | // BACKWARDS COMPATIBILITY: "datasetSchema" used to be the "storages.dataset" sub-object, all implementations should support this. 68 | "datasetSchema": "../shared_schemas/generic_dataset_schema.json", 69 | "keyValueStoreSchema": "./key_value_store_schema.json", 70 | 71 | // Optional path or inlined schema object of the Actor's web server in OpenAPI format. 72 | "webServerSchema": "./web_server_openapi.json", 73 | 74 | // Optional URL path and query parameters to the Model Context Protocol (MCP) server exposed by the Actor web server. 75 | // If present, the system knows the Actor provides an MCP server, which can be used by the platform 76 | // and integrations to integrate the Actor with various AI/LLM systems. 77 | "webServerMcpPath": "/mcp?version=2", 78 | 79 | // Scripts can be used by tools like the CLI to do certain actions based on the commands you run. 80 | // The presence of this object in your Actor config is optional, but we recommend always defining at least the `run` key. 81 | "scripts": { 82 | // The `run` script is special - it defines *the* way to run your Actor locally. 
While tools can decide 83 | // to implement mechanisms to detect what type of project your Actor is, and how to run it, you can choose to 84 | // define this as the source of truth. 85 | // 86 | // This should be the same command you run as if you were at the root of your Actor when you start it locally. 87 | // This can be anything from an npm script, as shown below, to a full chain of commands (e.g., `cargo test && cargo run --release`). 88 | // 89 | // CLIs may opt to also request this command when initializing a new Actor, or to automatically migrate and add it the first time 90 | // you start the Actor locally. 91 | "run": "npm start" 92 | } 93 | } 94 | ``` 95 | 96 | ## Notes 97 | 98 | - The `name` doesn't contain the developer username, so that the Actor can be easily deployed 99 | to any user account. This is useful for tutorials and examples, as well as 100 | pull requests done externally to create Actors from existing source code files 101 | owned by external developers 102 | (the developer might not have an Apify account yet, and we might want to show them deployment 103 | to some testing account). 104 | Note that `apify push` has an option `--target=eva/my-actor:0.0` that allows 105 | deployment of the Actor under a different user account, using the permissions 106 | and personal API token of the current user. 107 | We should also add options to override only parts of this, 108 | like `--target-user` (ID or username), `--name`, `--build-tag` and `--version`; 109 | this would be useful e.g. in CI for beta versions. 110 | - Note that `version` and `buildTag` are shared across Actor deployments to 111 | all user accounts, similarly to software libraries, 112 | and hence they are part of `actor.json`. 113 | - The `dockerfile` property points to a Dockerfile that is to be used to build the 114 | Actor image. If not present, the system looks for a Dockerfile in the `.actor` directory 115 | and, if not found, then in the Actor's top-level 116 | directory. This setting is useful if the source code repository has some 117 | other Dockerfile in the top-level directory, to separate the Actor Docker image from the 118 | other one. Note that paths in a Dockerfile are ALWAYS relative to the Dockerfile's location. 119 | When calling `apify run`, the system runs the Actor using the Dockerfile. 120 | - When calling `apify push` and the `title` or `description` are already set 121 | on the Actor (maybe SEO-optimized versions from a copywriter), 122 | by default we do not overwrite them 123 | unless `apify push` is called with the options `--force-title` or `--force-description`. 124 | 125 | ## Changes from the legacy `apify.json` file 126 | 127 | The `.actor/actor.json` replaces the legacy `apify.json` file. Here are the main changes from the previous version: 128 | 129 | - We removed the `template` property as it's not needed for anything; it only stored the original template. 130 | - There's a new `title` field for a human-readable name of the Actor. 131 | We're moving towards having human-readable names shown for Actors everywhere, 132 | so it makes sense to define `title` directly in the source code. 133 | - Similarly, we added `description` for the short description of what the Actor does. 134 | - `env` was renamed to `environmentVariables` for more clarity. `apify build` or `apify run` 135 | could have an option `--apply-env-vars-to-build` like we have on the platform. 
136 | - The `dockerfile` and `readme` directives are optional; the system falls back to reasonable 137 | defaults, first in the `.actor` directory and then in the top-level directory. 138 | - The `scripts` section was added 139 | -------------------------------------------------------------------------------- /pages/DATASET_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Dataset schema file specification 1.0 2 | 3 | Dataset storage enables you to sequentially store and retrieve data records in various formats. 4 | Each Actor run is assigned its own dataset, which is created when the first item is stored to it. 5 | Datasets usually contain results from web scraping, crawling, or data processing jobs. 6 | The data can be visualized as a table where each object is a row and its attributes are the columns. 7 | The data can be exported in JSON, CSV, XML, RSS, Excel, or HTML formats. 8 | 9 | The specification is also available at https://docs.apify.com/platform/actors/development/actor-definition/output-schema 10 | 11 | A dataset can be assigned a schema which describes: 12 | 13 | - Content of the dataset, i.e., the schema of objects that are allowed to be added 14 | - Different views on how we can look at the data, aka transformations 15 | - Visualization of the View using predefined components (grid, table, ...), which improves the run view interface in Apify Console 16 | and also provides a better interface for datasets shared by Apify users 17 | 18 | 19 | 20 | 21 | 22 | ## Basic properties 23 | 24 | - Storage is immutable. I.e., if you want to change the structure, then you need to create a new dataset. 25 | - Its schema is weak. I.e., you can always push additional properties, but the schema will ensure that all the listed ones are there with the correct type. This is to make Actors more compatible, i.e., some Actor might expect the dataset to contain certain fields but not care about the additional ones. 26 | 27 | There are two ways to create a dataset with a schema: 28 | 1. The user can start an Actor that has a dataset schema linked from its 29 | [OUTPUT_SCHEMA.json](./OUTPUT_SCHEMA.md) 30 | 2. Or the user can do it programmatically via the API (for an empty dataset), 31 | - either by passing the schema as a payload to the [create dataset](https://docs.apify.com/api#/reference/datasets/dataset-collection/create-dataset) API endpoint, 32 | - or by using the SDK: 33 | 34 | ```js 35 | const dataset = await Apify.openDataset('my-new-dataset', { schema }); 36 | ``` 37 | 38 | By opening an **existing** dataset with the `schema` parameter, the system ensures that you are opening a dataset that is compatible with the Actor; otherwise, you get an error: 39 | 40 | ``` 41 | Uncaught Error: Dataset schema is not compatible with the provided schema 42 | ``` 43 | 44 | ## Structure 45 | 46 | ```jsonc 47 | { 48 | "actorDatasetSchemaVersion": 1, 49 | "title": "E-shop products", 50 | "description": "Dataset containing the whole product catalog including prices and stock availability.", 51 | 52 | // A JSON schema object describing the dataset fields, with our extensions: the "title", "description", and "example" properties. 53 | // "example" is used to generate code and API examples for the Actor output. 
54 | // For details, see https://docs.apify.com/platform/actors/development/actor-definition/dataset-schema 55 | "fields": { 56 | "type": "object", 57 | "properties": { 58 | "title": { 59 | "type": "string", 60 | "description": "The name of the product", 61 | }, 62 | "imageUrl": { 63 | "type": "string", 64 | "description": "URL of the product image", 65 | }, 66 | "priceUsd": { 67 | "type": "integer", 68 | "description": "Price of the item", 69 | }, 70 | "manufacturer": { 71 | "type": "object", 72 | "properties": { 73 | "title": { ... }, 74 | "url": { ... }, 75 | } 76 | }, 77 | ... 78 | }, 79 | "required": ["title"], 80 | }, 81 | 82 | // Defines the ways to present the Dataset to users 83 | "views": { 84 | "overview": { 85 | "title": "Products overview", 86 | "description": "Displays only basic fields such as title and price", 87 | "transformation": { 88 | "flatten": ["stockInfo"], 89 | "fields": [ 90 | "title", 91 | "imageUrl", 92 | "stockInfo.availability" 93 | ] 94 | }, 95 | "display": { 96 | "component": "table", 97 | "properties": { 98 | "title": { 99 | "label": "Title" 100 | }, 101 | "imageUrl": { 102 | "label": "Image", 103 | "format": "image" // Optional; here the format is overridden to show an "image" instead of an image link ("text"). The "image" format only works with .jpeg, .png, or other image URLs. 104 | }, 105 | "stockInfo.availability": { 106 | "label": "Availability" 107 | } 108 | } 109 | } 110 | }, 111 | "productVariants": { 112 | "title": "Product variants", 113 | "description": "Each product expanded into one item per variant", 114 | "transformation": { 115 | "fields": [ 116 | "title", 117 | "price", 118 | "productVariants" 119 | ], 120 | "unwind": "productVariants" 121 | }, 122 | "display": { 123 | // Simply renders all the available fields. 124 | // This component is used by default when no display is specified. 125 | "component": "table" 126 | } 127 | } 128 | }, 129 | } 130 | ``` 131 | 132 | ## DatasetSchema object definition 133 | 134 | | Property | Type | Required | Description | 135 | | ------------------ | ---------------------------- | -------- | -------------------------------------------------------------------------------------------------- | 136 | | actorDatasetSchemaVersion | integer | true | Specifies the version of the dataset schema
structure document.
Currently only version 1 is available. | 137 | fields | JSON schema | true | A JSON schema object describing the dataset fields; more formats may be added in the future. | 138 | views | [DatasetView] | true | An object with the definitions of the API
and UI views. | 139 | 140 | ### JSON schema 141 | 142 | Items of a dataset can be described by a JSON schema definition, passed into the `fields` property. 143 | The Actor system then ensures that each record added to the dataset complies with the provided schema. 144 | 145 | ```jsonc 146 | { 147 | "type": "object", 148 | "required": [ 149 | "name", 150 | "email" 151 | ], 152 | "properties": { 153 | "id": { 154 | "type": "string" 155 | }, 156 | "name": { 157 | "type": "string" 158 | }, 159 | "email": { 160 | "type": "string" 161 | }, 162 | "arr": { 163 | "type": "array", 164 | "items": { 165 | "type": "object", 166 | "required": [], 167 | "properties": { 168 | "site": { 169 | "type": "string" 170 | }, 171 | "url": { 172 | "type": "string" 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | ``` 180 | 181 | 182 | ### DatasetView object definition 183 | 184 | | Property | Type | Required | Description | 185 | | -------------- | ------------------------- | -------- | ----------------------------------------------------------------------------------------------------- | 186 | | title | string | true | The title is visible in the UI in the Output tab
as well as in the API. | 187 | | description | string | false | The description is only available in the API response.
The usage of this field is optional. | 188 | transformation | ViewTransformation object | true | The definition of the data transformation applied when dataset data are loaded from the Dataset API. | 189 | display | ViewDisplay object | true | The definition of the Output tab UI visualization. | 190 | 191 | ### ViewTransformation object definition 192 | 193 | | Property | Type | Required | Description | 194 | | -------- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 195 | | fields | string[] | true | Selects fields that are going to be presented in the output.
The order of fields matches the order of columns
in the visualization UI. In case a field's value is missing, it will be presented as "undefined" in the UI. | 196 | unwind | string | false | Deconstructs nested children into the parent object, e.g., with unwind: ["foo"], the object `{"foo":{"bar":"hello"}}` is turned into `{"bar":"hello"}`. | 197 | flatten | string[] | false | Transforms a nested object into a flat structure, e.g., with flatten: ["foo"], the object `{"foo":{"bar":"hello"}}` is turned into `{"foo.bar":"hello"}`. | 198 | omit | string[] | false | Removes the specified fields from the output.
Nested field names can be used there as well. | 199 | limit | integer | false | The maximum number of results returned.
Default is all results. | 200 | desc | boolean | false | By default, results are sorted in ascending order based on the write event into the dataset. The desc: true param will return the newest writes to the dataset first. | 201 | 202 | ### ViewDisplay object definition 203 | 204 | | Property | Type | Required | Description | 205 | | ---------- | ------------------------------------------------------------------------------------------------------------------ | -------- | ---------------------------------------------------------------------------------------------------------------------------- | 206 | | component | string | true | Only the "table" component is available. | 207 | properties | Object | false | Object with keys matching the `transformation.fields`
and ViewDisplayProperty as values. In case properties are not set, the table will be rendered automatically with fields formatted as Strings, Arrays, or Objects. | 208 | 209 | ### ViewDisplayProperty object definition 210 | 211 | | Property | Type | Required | Description | 212 | | -------- | ------------------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------- | 213 | | label | string | false | Used when the data are visualized in Table view. The label will be visible in the table column's header. | 214 | format | enum(text, number, date, link,
boolean, image, array, object) | false | Describes how output data values are formatted
in order to be rendered in the output tab UI. | 215 | -------------------------------------------------------------------------------- /pages/IDEAS.md: -------------------------------------------------------------------------------- 1 | 2 | # Sandbox for various ideas 3 | 4 | Here you can find random ideas and notes, in no particular order, with no guarantee of relevance or that they will be implemented. 5 | 6 | ## TODOs 7 | 8 | 9 | - Add ideas for the permission system 10 | - Note from Marek regarding permissions: 11 | - Just a note on this, I was thinking about how this could be done systematically, so dropping the notes here: 12 | - By default, the Actor should have the following permissions that the user would accept when running the Actor for the first time: 13 | - Write to all the default + named storages linked in the output schema 14 | - Proxy - simply because we want all the traffic to run through the proxy, so we don't want Actors scraping directly 15 | - In `actor.json` the Actor could request additional permissions, basically anything from [permissions](https://docs.apify.com/access-rights/list-of-permissions#actor-task), for example, `DATASET.READ` to be able to read all the datasets or `SCHEDULER.WRITE` to manage schedules. 16 | There is one tricky part: 17 | - If an Actor needs to `.call()` other Actors then basically the user must give it full permissions. Otherwise, the Actor would have to list all the other Actors it's going to call, and the user would have to accept all the permissions needed in recursive calls. 18 | Extra question: 19 | - What to do if the new version of the Actor requires more permissions? We should probably require the author to increase the major version and keep users on the old build + email them to accept the updated permissions. 20 | 21 | - We should make env vars independent of Apify, i.e. start them with `ACTOR_`, rather than `APIFY_` 22 | 23 | - To storages, add info about atomic rename, e.g. the `setName` function, and link to other operations... 24 | 25 | - Maybe add an `Actor.getThisRun()` function to return the run object of the current Actor. Not sure about the use case... 26 | 27 | - Figure out the push/build workflow, see https://github.com/apify/actor-specs/pull/7/files#r997020215 28 | / https://github.com/apify/actor-specs/pull/7#pullrequestreview-1144097598 29 | how should that work with 30 | 31 | - Would be nice to have an API that would send a message to a run, and the run would get it as `.on('message', (msg) => { ... })`. Would save people from implementing their own servers in Actors. 32 | It would make it easier to orchestrate Actors. Currently it's a bit painful to create a "master" Actor and then "workers" to process some workloads. But it could probably be achieved with a queue, if it were distributed and generic. 33 | Explain why this is better than the live-view HTTP API 34 | 35 | 36 | - NOTE: BTW, we have a new API v3 doc with ideas for changes in the API: https://www.notion.so/apify/API-v3-6fcd240d9621427f9650b741ec6fa06b 37 | 38 | - For the DATASET schema: in future versions, let's consider referencing the schema using a URL; for now, let's keep it simple 39 | 40 | 41 | 42 | ### Pipe result of an Actor to another (aka chaining) 43 | 44 | An Actor can start other Actors and 45 | pass them its own dataset or key-value store. 46 | For example, the main Actor can produce files 47 | and the spawned Actors can consume them from the same storages. 48 | 49 | In the future, we could let datasets be cleaned up from the beginning, 50 | effectively creating a pipe, with a custom rolling window. 
51 | Webhooks can be attached to storage operations, 52 | and so launch other Actors to consume newly added items or files. 53 | 54 | #### UNIX equivalent 55 | 56 | ```bash 57 | $ ls -l | grep "something" | wc -l 58 | ``` 59 | 60 | **TODO (@jancurn):** We could have special CLI support for creating Actor chains using the pipe operator, 61 | like this: 62 | 63 | ``` 64 | $ apify call apify/google-search-scraper | apify call apify/send-email queryTerms="aaa\nbbb" 65 | ``` 66 | 67 | Note from Marek: 68 | Here we will need some way to map outputs from the first Actor to inputs of the following Actor; perhaps we could pipe them through some utility like [jq](https://stedolan.github.io/jq/tutorial/) 69 | or use some mapping like: 70 | 71 | ``` 72 | --input-dataset-id="$output.defaultDatasetId" --dataset-name="xxx" 73 | ``` 74 | 75 | Note from Ondra: 76 | I tried to write a JS example for piping, but figured that piping is not really aligned with how Actors work, because piping assumes the output of one program is immediately processed by another program. Actors can produce output like this, but they can't process input like this. Input is provided only once, when the Actor starts. Unless we consider e.g. the request queue as input. We will have to think about this a bit differently. 77 | 78 | Note from Jan: 79 | Indeed, the flow is to start one Actor, and pass one of its storages as the default to the other newly started Actor. If we had a generic Queue, it could be used nicely for this use case. I'm adding these notes to the doc, so that we can get back to them later. 80 | 81 | Jan: I'd get rid of the Request queue from the Actor specification, and keep it as Apify's extension only. 82 | 83 | 84 | -------------------------------------------------------------------------------- /pages/INPUT_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Actor input schema file specification 1.0 2 | 3 | This JSON file defines the schema and description of the input object accepted by the 4 | Actor (see [Input](../README.md#input) for details). 5 | The file is referenced from the main [Actor file (.actor/actor.json)](ACTOR_FILE.md) using the `inputSchema` directive, 6 | and it is typically stored in `.actor/input_schema.json`. 7 | 8 | The file is a JSON schema with our extensions describing a single Actor input object 9 | and its properties, including documentation, default value, and user interface definition. 10 | 11 | **For full reference, see [Input schema specification](https://docs.apify.com/platform/actors/development/actor-definition/input-schema/specification/v1) in Apify documentation.** 12 | 13 | 14 | 15 | 16 | ## Example Actor input schema 17 | 18 | ```jsonc 19 | { 20 | "actorInputSchemaVersion": 1, 21 | 22 | "title": "Input schema for an Actor", 23 | "description": "Enter the start URL(s) of the website(s) to crawl, configure other optional settings, and run the Actor to crawl the pages and extract their text content.", 24 | "type": "object", 25 | 26 | "properties": { 27 | 28 | "startUrls": { 29 | "title": "Start URLs", 30 | "type": "array", 31 | "description": "One or more URLs of the pages where the crawler will start. Note that the Actor will additionally only crawl sub-pages of these URLs. 
For example, for the start URL `https://www.example.com/blog`, it will crawl pages like `https://example.com/blog/article-1`, but will skip `https://example.com/docs/something-else`.", 32 | "editor": "requestListSources", 33 | "prefill": [{ "url": "https://docs.apify.com/" }] 34 | }, 35 | 36 | // The input value is another Dataset. The system can generate a UI to make it easy to select the dataset. 37 | "processDatasetId": { 38 | "title": "Input dataset", 39 | "type": "string", 40 | "resourceType": "dataset", 41 | "description": "Dataset to be processed by the Actor", 42 | // Optional link to dataset schema, used by the system to validate the input dataset 43 | "schema": "./input_dataset_schema.json" 44 | }, 45 | 46 | "screenshotsKeyValueStoreId": { 47 | "title": "Screenshots to process", 48 | "type": "string", 49 | "resourceType": "keyValueStore", 50 | "description": "Screenshots to be compressed", 51 | "schema": "./input_key_value_store_schema.json" 52 | }, 53 | 54 | "singleFileUrl": { 55 | "title": "Some file", 56 | "type": "string", 57 | "editor": "fileupload", 58 | "description": "File to be processed", 59 | "schema": "./input_key_value_store_schema.json" 60 | }, 61 | 62 | "crawlerType": { 63 | "sectionCaption": "Crawler settings", 64 | "title": "Crawler type", 65 | "type": "string", 66 | "enum": ["playwright:chrome", "cheerio", "jsdom"], 67 | "enumTitles": ["Headless web browser (Chrome+Playwright)", "Raw HTTP client (Cheerio)", "Raw HTTP client with JS execution (JSDOM) (experimental!)"], 68 | "description": "Select the crawling engine:\n- **Headless web browser** (default) - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.", 69 | "default": "playwright:chrome" 70 | }, 71 | 72 | "maxCrawlDepth": { 73 | "title": "Max crawling depth", 74 | "type": "integer", 75 | "description": "The maximum number of links starting from the start URL that the crawler will recursively descend. The start URLs have a depth of 0, the pages linked directly from the start URLs have a depth of 1, and so on.\n\nThis setting is useful to prevent accidental crawler runaway. By setting it to 0, the Actor will only crawl the start URLs.", 76 | "minimum": 0, 77 | "default": 20 78 | }, 79 | 80 | "maxCrawlPages": { 81 | "title": "Max pages", 82 | "type": "integer", 83 | "description": "The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.", 84 | "minimum": 0, 85 | "default": 9999999 86 | } 87 | 88 | } 89 | } 90 | ``` 91 | 92 | ## Random notes 93 | 94 | 95 | We could also add an `actor` resource type. The use case could be, for example, a testing Actor with three inputs: 96 | - the Actor to be tested 97 | - a test function containing, for example, a Jest unit test over the output 98 | - an input for the Actor 99 | 100 | ...and the testing Actor would call the given Actor with the given input and, in the end, execute the tests to check whether the results are correct. 
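For illustration, here is a minimal sketch of how an Actor might read the input object described by a schema like the one above, using the Apify JS SDK. The `startUrls` and `maxCrawlPages` fields come from the example schema; the local fallback default simply mirrors it:

```js
import { Actor } from 'apify';

await Actor.init();

// The input object has been validated against the Actor's input schema,
// so the fields below arrive with the declared types.
const input = await Actor.getInput();
const { startUrls, maxCrawlPages = 9999999 } = input;

console.log(`Crawling ${startUrls.length} start URL(s), up to ${maxCrawlPages} pages.`);

await Actor.exit();
```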
101 | 102 | -------------------------------------------------------------------------------- /pages/KEY_VALUE_STORE_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Key-value store schema file specification [work in progress] 2 | 3 | This JSON file should contain a schema for files stored in the key-value store, 4 | defining their name, format, or content type. 5 | 6 | **BEWARE: This is not implemented yet and is subject to change.** 7 | 8 | ## Basic properties 9 | 10 | Key-value store schema has three main use cases, described in the following examples: 11 | 12 | 1. Some Actors such as [Instagram scraper](https://apify.com/jaroslavhejlek/instagram-scraper) 13 | store multiple types of files in the key-value store. Let's say the scraper stores post images and user pictures. 14 | So for each of these, we would define a prefix group called a collection and allow the user to list files from a single collection in both the 15 | UI and the API. 16 | 17 | ```jsonc 18 | { 19 | "collections": { 20 | "screenshots": { 21 | "name": "Post images", 22 | "keyPrefix": "images-", 23 | "contentTypes": ["image/jpeg", "image/png"] 24 | } 25 | } 26 | } 27 | ``` 28 | 29 | 2. Some Actors store a specific record, and we want to ensure its content type is HTML and embed it into the run view. 30 | A good example is the [monitoring](https://apify.com/apify/monitoring#check-frequency) Actor, which generates an HTML report that we would 31 | like to embed into the run view for the user once the monitoring is finished. 32 | 33 | ```jsonc 34 | { 35 | "collections": { 36 | "monitoringReport": { 37 | "name": "Monitoring report", 38 | "description": "HTML page containing monitoring results", 39 | "key": "REPORT", 40 | "contentTypes": ["text/html"] 41 | } 42 | } 43 | } 44 | ``` 45 | 46 | 3. Some Actors store a record that has a specific structure. The structure can be specified using [JSON schema](https://json-schema.org/draft-07). 47 | Contrary to the dataset schema, a record in the key-value store represents output that is a single item instead of a sequence of items. But both approaches use JSON schema to describe the structure. 48 | 49 | ```jsonc 50 | { 51 | "collections": { 52 | "monitoringReportData": { 53 | "name": "Monitoring report data", 54 | "description": "JSON containing the report data", 55 | "key": "report-data.json", 56 | "contentTypes": ["application/json"], 57 | "jsonSchema": { 58 | "$schema": "http://json-schema.org/draft-07/schema#", 59 | "type": "object", 60 | "properties": { 61 | "summary": { "type": "string" }, 62 | "totalResults": { "type": "number" } 63 | } 64 | } // alternatively "jsonSchema": "./report-schema.json" can be used 65 | } 66 | } 67 | } 68 | ``` 69 | 70 | ## Structure 71 | 72 | ```jsonc 73 | { 74 | "actorKeyValueStoreSchemaVersion": 1, 75 | "name": "My Instagram backup", 76 | "description": "Backup of my Instagram account", 77 | 78 | "collections": { 79 | "postImages": { 80 | "name": "Post images", 81 | "description": "Contains all Instagram post images", 82 | "keyPrefix": "post-image-", 83 | "contentTypes": ["image/jpeg", "image/png"] 84 | }, 85 | 86 | "profilePicture": { 87 | "name": "Profile picture", 88 | "key": "profile-picture", 89 | "contentTypes": ["image/*"] // Wildcards can be used to allow all image or text types, etc. 
90 | } 91 | } 92 | } 93 | ``` 94 | 95 | ## API implications 96 | 97 | Enable the user to list keys for a specific collection: 98 | 99 | ``` 100 | https://api.apify.com/v2/key-value-stores/storeId/keys?collection=postImages&exclusiveStartKey=xxx 101 | ``` 102 | 103 | In addition to this, the user will be able to list by prefix directly: 104 | 105 | ``` 106 | https://api.apify.com/v2/key-value-stores/storeId/keys?prefix=post-images- 107 | ``` 108 | -------------------------------------------------------------------------------- /pages/OUTPUT_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Actor output schema file specification 1.0 [work in progress] 2 | 3 | This JSON file defines the schema of the [output](../README.md#output) object produced by a web Actor. 4 | The file is referenced from the main [Actor file](./ACTOR_FILE.md) using the `outputSchema` property, 5 | and it is typically stored in `.actor/output_schema.json`. 6 | 7 | The format is a JSON Schema with our extensions, describing a single object. 8 | 9 | The output schema is used by the system to generate the 10 | output JSON object, 11 | whose fields correspond to `properties` and whose values are URLs linking to the actual Actor results in a dataset, key-value store files, or a live view web server. 12 | This output object is generated by the system right when the Actor starts, without executing any of the Actor's code, 13 | and remains static over the entire lifecycle of the Actor; only the linked content changes over time as the Actor produces results. 14 | This is necessary to enable integration of results into other systems, as you don't need to run the Actor 15 | to see the format of its results; it's predefined by the output schema. 16 | 17 | The output schema is also used by the system to generate the user interface, API examples, integrations, etc. 18 | 19 | ## Structure 20 | 21 | ```jsonc 22 | { 23 | "actorOutputSchemaVersion": 1, 24 | 25 | "title": "Some title", 26 | "description": "This text is shown in the Output UI", 27 | "type": "object", 28 | 29 | "properties": { 30 | 31 | // This property in the output object will contain a URL to the dataset containing Actor results, 32 | // for example: https://api.apify.com/v2/datasets/XYZabc/items?format=json&view=product_details 33 | "currentProductsDatasetUrl": { 34 | // Type is string, because the value in the output object is a URL 35 | "type": "string", 36 | "title": "Current products", 37 | "description": "Yaddada", 38 | 39 | // Identifies what kind of object is referenced by this output property (same syntax as "resourceType" in the input schema). 40 | // If used, the system will interpret the "source" and render the dataset in the UI in a special way. 41 | "resourceType": "dataset", 42 | 43 | // Defines how the output value is created, using a text format where {{x}} denotes variables (same syntax as webhook templates) 44 | "template": "{{actorRun.defaultDatasetUrl}}?format=json&view=product_details", 45 | 46 | // Or reference a property from the input object; the linkage will be checked for type compatibility 47 | // "template": "{{actorInput.myProductsDatasetId}}" 48 | }, 49 | 50 | // Selects a specific group of records with a certain prefix. In the UI, this can be shown 51 | // as a list of images. In the output object, this will be a link to an API with a "prefix" param. 
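// As an illustration (an assumed resolved value, following the collections API shown in KEY_VALUE_STORE_SCHEMA.md, with a made-up store ID): // https://api.apify.com/v2/key-value-stores/XYZabc/keys?collection=screenshots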
52 | "productImagesUrl": { 53 | "type": "string", 54 | "title": "Product screenshots", 55 | 56 | "resourceType": "keyValueStore", 57 | 58 | // Define how the URL is created, in this case it will link to the default Actor key-value store 59 | "template": "{{actorRun.defaultKeyValueStoreUrl}}?collection=screenshots" 60 | }, 61 | 62 | // Example of reference to a file stored in Actor's default key-value store. 63 | // In UI can be rendered as a file download. 64 | "mainScreenshotFileUrl": { 65 | "type": "string", 66 | "title": "Main screenshot", 67 | "description": "URL to an image with main product screenshot.", 68 | "template": "{{actorRun.defaultKeyValueStoreUrl}}/screenshot.png", 69 | }, 70 | 71 | // Live view web server for to the Actor 72 | // In the "output" view, this page is rendered in an IFRAME 73 | "productExplorerWebUrl": { 74 | "type": "string", 75 | "resourceType": "webServer", 76 | "title": "Live product explorer app", 77 | "description": "API documentation is available in swagger.com/api/xxxx", // optional 78 | 79 | // TODO: ideally this should be named {{actorRun.webServerUrl}} for consistency, but we'd need to change ActorRun everywhere 80 | "template": "{{actorRun.containerUrl}}/product-explorer/", 81 | } 82 | } 83 | } 84 | ``` 85 | 86 | 87 | ## Random notes 88 | 89 | The output schema can reference other datasets/kv-stores/queues 90 | but only those ones that are referenced in the input, or the default. Hence 91 | there's no point to include storage schema here again, as it's done elsewhere. 92 | 93 | - **NOTE:** The output schema should enable developers to define schema for the 94 | default dataset and key-value store. But how? It should be declarative 95 | so that the system can check that e.g. the overridden default dataset 96 | has the right schema. But then, when it comes to kv-store, that's not purely 97 | output object but INPUT, similarly for overridden dataset or request queue. 98 | Perhaps the cleanest way would be to set these directly in `.actor/actor.json`. 99 | - The Run Sync API could have an option to automatically return (or redirect to?) 100 | a specific property (i.e. URL) of the output object. 101 | This would supersede the `outputRecordKey=OUTPUT` API param as well as 102 | the run-sync-get-dataset-items API endpoint. 103 | Maybe we could have one of the output properties as the main one, 104 | which would be used by default for this kind of API endpoint, and just return 105 | data to user. 106 | - Same as we show Output in UI, we need to autogenerate the OUTPUT in API e.g. JSON format. 107 | There would be properties like in the output_schema.json file, with e.g. URL to dataset, 108 | log file, kv-store, live view etc. So it would be an auto-generated field "output" 109 | that we can add to JSON returned by the Run API endpoints 110 | (e.g. https://docs.apify.com/api/v2#/reference/actor-tasks/run-collection/run-task) 111 | - Also see: https://github.com/apify/actor-specs/pull/5#discussion_r775641112 112 | - `output` will be a property of run object generated from Output schema 113 | 114 | 115 | 116 | ## Examples of ideal Actor run UI 117 | 118 | - For the majority of Actors, we want to see the dataset with new records being added in realtime 119 | - For [Google Spreadsheet Import](https://apify.com/lukaskrivka/google-sheets), 120 | we want to first display Live View for the user to set up OAUTH, and once 121 | this is set up, then we want to display the log next time. 
122 | - For technical Actors, it might be a log
123 | - For [HTML to PDF converter](https://apify.com/jancurn/url-to-pdf), it's a single record from the key-value store
124 | - For [Monitoring](https://apify.com/apify/monitoring-runner), it's the log during the runtime and a single HTML record in an iframe at the end
125 | - For an Actor that has failed, it might be the log
126 | 
127 | ## How to define Actor run UI
128 | 
129 | ### Simple version
130 | 
131 | There will be a new tab called "Output" on the Actor run detail, for every Actor with an output schema.
132 | This tab will be at the first position and displayed by default. The tab will show the following:
133 | - Items from the output schema with the property `visible: true` will be rendered in the same order
134 |   as they appear in the schema
135 | - The live view will be displayed only when it has `visible: true` and when it's active.
136 |   Otherwise, we should show just a short message "This show is over".
137 | - If the dataset has more views, then we should show a select or tabs to switch between the views
138 | 
139 | ### Ideal, most comprehensive state
140 | 
141 | - Default setup, i.e., what output components should be displayed on the default run tab
142 | - Optionally, the setup for different states
143 | - The ability to programmatically change this via the API, from the Actor itself
144 | 
--------------------------------------------------------------------------------
/pages/REQUEST_QUEUE_SCHEMA.md:
--------------------------------------------------------------------------------
1 | # Request queue schema file specification [work in progress]
2 | 
3 | Currently, this is neither specified nor implemented.
4 | We think that a request queue schema might be useful for two things:
5 | 
6 | - ensuring which kinds of URLs can be enqueued (certain domains or subdomains, ...)
7 | - ensuring that, for example, each request has `userData.label`, i.e. enforcing a schema of `userData` the same way we enforce it for datasets
8 | 
9 | We should consider renaming `RequestQueue` to just `Queue` and making it more generic, and then it makes sense to have a request schema.
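For illustration only (nothing here is specified yet, and all property names below are hypothetical), such a schema file might look something like this:

```jsonc
{
    "actorRequestQueueSchemaVersion": 1,
    "title": "Product crawl queue",

    // Hypothetical: only URLs matching one of these regexes may be enqueued
    "allowedUrlRegexes": ["^https://([a-z0-9-]+\\.)?example\\.com/"],

    // Hypothetical: enforce the shape of "userData" the same way dataset schemas
    // enforce the shape of items, e.g. require that each request has a "label"
    "userDataSchema": {
        "type": "object",
        "properties": {
            "label": { "type": "string" }
        },
        "required": ["label"]
    }
}
```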
10 | 11 | **This is to be done** 12 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/ACTOR.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "amazon", 3 | "template": "puppeteer_crawler", 4 | "version": "0.1", 5 | "buildTag": "latest", 6 | "env": null 7 | } 8 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper": { 3 | "title": "Use Browser", 4 | "type": "boolean", 5 | "description": "Keep checked in order to use a real browser in evaluating amazon", 6 | "editor": "checkbox", 7 | "default": true 8 | }, 9 | "country": { 10 | "title": "Amazon market", 11 | "type": "string", 12 | "description": "Select your Amazon domain", 13 | "editor": "select", 14 | "default": "US", 15 | "enum": [ 16 | "US", 17 | "UK", 18 | "DE", 19 | "ES", 20 | "FR", 21 | "IT", 22 | "IN", 23 | "CA", 24 | "JP", 25 | "AE", 26 | "SA", 27 | "BR", 28 | "MX", 29 | "SG", 30 | "TR", 31 | "NL", 32 | "AU", 33 | "SE" 34 | ], 35 | "enumTitles": [ 36 | "amazon.com", 37 | "amazon.co.uk", 38 | "amazon.de", 39 | "amazon.es", 40 | "amazon.fr", 41 | "amazon.it", 42 | "amazon.in", 43 | "amazon.ca", 44 | "amazon.co.jp", 45 | "amazon.ae", 46 | "amazon.sa", 47 | "amazon.com.br", 48 | "amazon.com.mx", 49 | "amazon.sg", 50 | "amazon.com.tr", 51 | "amazon.nl", 52 | "amazon.com.au", 53 | "amazon.se" 54 | ] 55 | }, 56 | "category": { 57 | "title": "Store Department", 58 | "type": "string", 59 | "description": "Select the store category", 60 | "editor": "select", 61 | "default": "aps", 62 | "enumTitles": [ 63 | "All Departments", 64 | "Arts & Crafts", 65 | "Automotive", 66 | "Baby", 67 | "Beauty & Personal Care", 68 | "Books", 69 | "Computers", 70 | "Digital Music", 71 | "Electronics", 72 | "Kindle Store", 73 | "Prime Video", 74 | "Women's Fashion", 75 | "Men's Fashion", 76 | "Girls' Fashion", 77 | "Boys' Fashion", 78 | "Deals", 79 | "Health & Household", 80 | "Home & Kitchen", 81 | "Industrial & Scientific", 82 | "Luggage", 83 | "Movies & TV", 84 | "Music, CDs & Vinyl", 85 | "Pet Supplies", 86 | "Software", 87 | "Sports & Outdoors", 88 | "Tools & Home Improvement", 89 | "Toys & Games", 90 | "Video Games" 91 | ], 92 | "enum": [ 93 | "aps", 94 | "arts-crafts-intl-ship", 95 | "automotive-intl-ship", 96 | "baby-products-intl-ship", 97 | "beauty-intl-ship", 98 | "stripbooks-intl-ship", 99 | "computers-intl-ship", 100 | "digital-music", 101 | "electronics-intl-ship", 102 | "digital-text", 103 | "instant-video", 104 | "fashion-womens-intl-ship", 105 | "fashion-mens-intl-ship", 106 | "fashion-girls-intl-ship", 107 | "fashion-boys-intl-ship", 108 | "deals-intl-ship", 109 | "hpc-intl-ship", 110 | "kitchen-intl-ship", 111 | "industrial-intl-ship", 112 | "luggage-intl-ship", 113 | "movies-tv-intl-ship", 114 | "music-intl-ship", 115 | "pets-intl-ship", 116 | "software-intl-ship", 117 | "sporting-intl-ship", 118 | "tools-intl-ship", 119 | "toys-and-games-intl-ship", 120 | "videogames-intl-ship" 121 | ] 122 | }, 123 | "searchType": { 124 | "title": "Select the type of search you would like", 125 | "type": "string", 126 | "description": "Select the type of search to perform from a choice of keywords, asins or direct Urls", 127 | "editor": "select", 128 | "default": "keywords", 129 | "enum": [ 130 | "keywords", 131 | "asins", 132 
| "directUrls" 133 | ], 134 | "enumTitles": [ 135 | "Keywords", 136 | "ASINs", 137 | "Direct URLs" 138 | ] 139 | }, 140 | "search": { 141 | "title": "Search", 142 | "type": "string", 143 | "description": "Keywords, asins or directUrls you would like to extract from Amazon, comma separated", 144 | "prefill": "Iphone X,Samsung monitor 27 QHD", 145 | "editor": "textarea" 146 | }, 147 | "maxResults": { 148 | "title": "Number of results", 149 | "type": "integer", 150 | "description": "Number of results you would like to save in total.", 151 | "editor": "number" 152 | }, 153 | "proxy": { 154 | "title": "Proxy configuration", 155 | "type": "object", 156 | "description": "Select proxies to be used by your crawler.", 157 | "prefill": { 158 | "useApifyProxy": true 159 | }, 160 | "editor": "proxy" 161 | }, 162 | "maxReviews": { 163 | "title": "Number of reviews", 164 | "type": "integer", 165 | "description": "Number of reviews you would like to save per product.", 166 | "default": 0, 167 | "editor": "number" 168 | }, 169 | "delivery": { 170 | "title": "Delivery Location", 171 | "type": "string", 172 | "description": "Select the location you would like the product to be delivered to", 173 | "editor": "select", 174 | "default": "", 175 | "enum": [ 176 | "", 177 | "AU,GLUXCountryList_0", 178 | "CA,GLUXCountryList_1", 179 | "CN,GLUXCountryList_2", 180 | "JP,GLUXCountryList_3", 181 | "MX,GLUXCountryList_4", 182 | "SG,GLUXCountryList_5", 183 | "GB,GLUXCountryList_6", 184 | "AF,GLUXCountryList_7", 185 | "AX,GLUXCountryList_8", 186 | "AL,GLUXCountryList_9", 187 | "DZ,GLUXCountryList_10", 188 | "AS,GLUXCountryList_11", 189 | "AD,GLUXCountryList_12", 190 | "AO,GLUXCountryList_13", 191 | "AI,GLUXCountryList_14", 192 | "AG,GLUXCountryList_15", 193 | "AR,GLUXCountryList_16", 194 | "AM,GLUXCountryList_17", 195 | "AW,GLUXCountryList_18", 196 | "AU,GLUXCountryList_19", 197 | "AT,GLUXCountryList_20", 198 | "AZ,GLUXCountryList_21", 199 | "BS,GLUXCountryList_22", 200 | "BH,GLUXCountryList_23", 201 | "BD,GLUXCountryList_24", 202 | "BB,GLUXCountryList_25", 203 | "BY,GLUXCountryList_26", 204 | "BE,GLUXCountryList_27", 205 | "BZ,GLUXCountryList_28", 206 | "BJ,GLUXCountryList_29", 207 | "BM,GLUXCountryList_30", 208 | "BT,GLUXCountryList_31", 209 | "BO,GLUXCountryList_32", 210 | "BQ,GLUXCountryList_33", 211 | "BA,GLUXCountryList_34", 212 | "BW,GLUXCountryList_35", 213 | "BV,GLUXCountryList_36", 214 | "BR,GLUXCountryList_37", 215 | "IO,GLUXCountryList_38", 216 | "BN,GLUXCountryList_39", 217 | "BG,GLUXCountryList_40", 218 | "BF,GLUXCountryList_41", 219 | "BI,GLUXCountryList_42", 220 | "KH,GLUXCountryList_43", 221 | "CM,GLUXCountryList_44", 222 | "CA,GLUXCountryList_45", 223 | "CV,GLUXCountryList_46", 224 | "KY,GLUXCountryList_47", 225 | "CF,GLUXCountryList_48", 226 | "TD,GLUXCountryList_49", 227 | "CL,GLUXCountryList_50", 228 | "CN,GLUXCountryList_51", 229 | "CX,GLUXCountryList_52", 230 | "CC,GLUXCountryList_53", 231 | "CO,GLUXCountryList_54", 232 | "KM,GLUXCountryList_55", 233 | "CG,GLUXCountryList_56", 234 | "CD,GLUXCountryList_57", 235 | "CK,GLUXCountryList_58", 236 | "CR,GLUXCountryList_59", 237 | "CI,GLUXCountryList_60", 238 | "HR,GLUXCountryList_61", 239 | "CW,GLUXCountryList_62", 240 | "CY,GLUXCountryList_63", 241 | "CZ,GLUXCountryList_64", 242 | "DK,GLUXCountryList_65", 243 | "DJ,GLUXCountryList_66", 244 | "DM,GLUXCountryList_67", 245 | "DO,GLUXCountryList_68", 246 | "EC,GLUXCountryList_69", 247 | "EG,GLUXCountryList_70", 248 | "SV,GLUXCountryList_71", 249 | "GQ,GLUXCountryList_72", 250 | "ER,GLUXCountryList_73", 251 | 
"EE,GLUXCountryList_74", 252 | "ET,GLUXCountryList_75", 253 | "FK,GLUXCountryList_76", 254 | "FO,GLUXCountryList_77", 255 | "FJ,GLUXCountryList_78", 256 | "FI,GLUXCountryList_79", 257 | "FR,GLUXCountryList_80", 258 | "GF,GLUXCountryList_81", 259 | "PF,GLUXCountryList_82", 260 | "TF,GLUXCountryList_83", 261 | "GA,GLUXCountryList_84", 262 | "GM,GLUXCountryList_85", 263 | "GE,GLUXCountryList_86", 264 | "DE,GLUXCountryList_87", 265 | "GH,GLUXCountryList_88", 266 | "GI,GLUXCountryList_89", 267 | "GR,GLUXCountryList_90", 268 | "GL,GLUXCountryList_91", 269 | "GD,GLUXCountryList_92", 270 | "GP,GLUXCountryList_93", 271 | "GT,GLUXCountryList_94", 272 | "GG,GLUXCountryList_95", 273 | "GN,GLUXCountryList_96", 274 | "GW,GLUXCountryList_97", 275 | "GY,GLUXCountryList_98", 276 | "HT,GLUXCountryList_99", 277 | "HM,GLUXCountryList_100", 278 | "VA,GLUXCountryList_101", 279 | "HN,GLUXCountryList_102", 280 | "HK,GLUXCountryList_103", 281 | "HU,GLUXCountryList_104", 282 | "IS,GLUXCountryList_105", 283 | "IN,GLUXCountryList_106", 284 | "ID,GLUXCountryList_107", 285 | "IQ,GLUXCountryList_108", 286 | "IE,GLUXCountryList_109", 287 | "IM,GLUXCountryList_110", 288 | "IL,GLUXCountryList_111", 289 | "IT,GLUXCountryList_112", 290 | "JM,GLUXCountryList_113", 291 | "JP,GLUXCountryList_114", 292 | "JE,GLUXCountryList_115", 293 | "JO,GLUXCountryList_116", 294 | "KZ,GLUXCountryList_117", 295 | "KE,GLUXCountryList_118", 296 | "KI,GLUXCountryList_119", 297 | "KR,GLUXCountryList_120", 298 | "XK,GLUXCountryList_121", 299 | "KW,GLUXCountryList_122", 300 | "KG,GLUXCountryList_123", 301 | "LA,GLUXCountryList_124", 302 | "LV,GLUXCountryList_125", 303 | "LB,GLUXCountryList_126", 304 | "LS,GLUXCountryList_127", 305 | "LR,GLUXCountryList_128", 306 | "LY,GLUXCountryList_129", 307 | "LI,GLUXCountryList_130", 308 | "LT,GLUXCountryList_131", 309 | "LU,GLUXCountryList_132", 310 | "MO,GLUXCountryList_133", 311 | "MK,GLUXCountryList_134", 312 | "MG,GLUXCountryList_135", 313 | "MW,GLUXCountryList_136", 314 | "MY,GLUXCountryList_137", 315 | "MV,GLUXCountryList_138", 316 | "ML,GLUXCountryList_139", 317 | "MT,GLUXCountryList_140", 318 | "MH,GLUXCountryList_141", 319 | "MQ,GLUXCountryList_142", 320 | "MR,GLUXCountryList_143", 321 | "MU,GLUXCountryList_144", 322 | "YT,GLUXCountryList_145", 323 | "MX,GLUXCountryList_146", 324 | "FM,GLUXCountryList_147", 325 | "MD,GLUXCountryList_148", 326 | "MC,GLUXCountryList_149", 327 | "MN,GLUXCountryList_150", 328 | "ME,GLUXCountryList_151", 329 | "MS,GLUXCountryList_152", 330 | "MA,GLUXCountryList_153", 331 | "MZ,GLUXCountryList_154", 332 | "MM,GLUXCountryList_155", 333 | "NA,GLUXCountryList_156", 334 | "NR,GLUXCountryList_157", 335 | "NP,GLUXCountryList_158", 336 | "NL,GLUXCountryList_159", 337 | "AN,GLUXCountryList_160", 338 | "NC,GLUXCountryList_161", 339 | "NZ,GLUXCountryList_162", 340 | "NI,GLUXCountryList_163", 341 | "NE,GLUXCountryList_164", 342 | "NG,GLUXCountryList_165", 343 | "NU,GLUXCountryList_166", 344 | "NF,GLUXCountryList_167", 345 | "NO,GLUXCountryList_168", 346 | "OM,GLUXCountryList_169", 347 | "PK,GLUXCountryList_170", 348 | "PW,GLUXCountryList_171", 349 | "PS,GLUXCountryList_172", 350 | "PA,GLUXCountryList_173", 351 | "PG,GLUXCountryList_174", 352 | "PY,GLUXCountryList_175", 353 | "PE,GLUXCountryList_176", 354 | "PH,GLUXCountryList_177", 355 | "PN,GLUXCountryList_178", 356 | "PL,GLUXCountryList_179", 357 | "PT,GLUXCountryList_180", 358 | "QA,GLUXCountryList_181", 359 | "RE,GLUXCountryList_182", 360 | "RO,GLUXCountryList_183", 361 | "RU,GLUXCountryList_184", 362 | "RW,GLUXCountryList_185", 
363 | "BL,GLUXCountryList_186", 364 | "SH,GLUXCountryList_187", 365 | "KN,GLUXCountryList_188", 366 | "LC,GLUXCountryList_189", 367 | "MF,GLUXCountryList_190", 368 | "PM,GLUXCountryList_191", 369 | "VC,GLUXCountryList_192", 370 | "WS,GLUXCountryList_193", 371 | "SM,GLUXCountryList_194", 372 | "ST,GLUXCountryList_195", 373 | "SA,GLUXCountryList_196", 374 | "SN,GLUXCountryList_197", 375 | "RS,GLUXCountryList_198", 376 | "SC,GLUXCountryList_199", 377 | "SL,GLUXCountryList_200", 378 | "SG,GLUXCountryList_201", 379 | "SX,GLUXCountryList_202", 380 | "SK,GLUXCountryList_203", 381 | "SI,GLUXCountryList_204", 382 | "SB,GLUXCountryList_205", 383 | "SO,GLUXCountryList_206", 384 | "ZA,GLUXCountryList_207", 385 | "GS,GLUXCountryList_208", 386 | "ES,GLUXCountryList_209", 387 | "LK,GLUXCountryList_210", 388 | "SR,GLUXCountryList_211", 389 | "SJ,GLUXCountryList_212", 390 | "SZ,GLUXCountryList_213", 391 | "SE,GLUXCountryList_214", 392 | "CH,GLUXCountryList_215", 393 | "TW,GLUXCountryList_216", 394 | "TJ,GLUXCountryList_217", 395 | "TZ,GLUXCountryList_218", 396 | "TH,GLUXCountryList_219", 397 | "TL,GLUXCountryList_220", 398 | "TG,GLUXCountryList_221", 399 | "TK,GLUXCountryList_222", 400 | "TO,GLUXCountryList_223", 401 | "TT,GLUXCountryList_224", 402 | "TN,GLUXCountryList_225", 403 | "TR,GLUXCountryList_226", 404 | "TM,GLUXCountryList_227", 405 | "TC,GLUXCountryList_228", 406 | "TV,GLUXCountryList_229", 407 | "UG,GLUXCountryList_230", 408 | "UA,GLUXCountryList_231", 409 | "AE,GLUXCountryList_232", 410 | "GB,GLUXCountryList_233", 411 | "UM,GLUXCountryList_234", 412 | "UY,GLUXCountryList_235", 413 | "UZ,GLUXCountryList_236", 414 | "VU,GLUXCountryList_237", 415 | "VE,GLUXCountryList_238", 416 | "VN,GLUXCountryList_239", 417 | "VG,GLUXCountryList_240", 418 | "WF,GLUXCountryList_241", 419 | "EH,GLUXCountryList_242", 420 | "YE,GLUXCountryList_243", 421 | "ZM,GLUXCountryList_244", 422 | "ZW,GLUXCountryList_245" 423 | ], 424 | "enumTitles": [ 425 | "Default", 426 | "Australia", 427 | "Canada", 428 | "China", 429 | "Japan", 430 | "Mexico", 431 | "Singapore", 432 | "United Kingdom", 433 | "Afghanistan", 434 | "Aland Islands", 435 | "Albania", 436 | "Algeria", 437 | "American Samoa", 438 | "Andorra", 439 | "Angola", 440 | "Anguilla", 441 | "Antigua and Barbuda", 442 | "Argentina", 443 | "Armenia", 444 | "Aruba", 445 | "Australia", 446 | "Austria", 447 | "Azerbaijan", 448 | "Bahamas, The", 449 | "Bahrain", 450 | "Bangladesh", 451 | "Barbados", 452 | "Belarus", 453 | "Belgium", 454 | "Belize", 455 | "Benin", 456 | "Bermuda", 457 | "Bhutan", 458 | "Bolivia", 459 | "Bonaire, Saint Eustatius and Saba", 460 | "Bosnia and Herzegovina", 461 | "Botswana", 462 | "Bouvet Island", 463 | "Brazil", 464 | "British Indian Ocean Territory", 465 | "Brunei Darussalam", 466 | "Bulgaria", 467 | "Burkina Faso", 468 | "Burundi", 469 | "Cambodia", 470 | "Cameroon", 471 | "Canada", 472 | "Cape Verde", 473 | "Cayman Islands", 474 | "Central African Republic", 475 | "Chad", 476 | "Chile", 477 | "China", 478 | "Christmas Island", 479 | "Cocos (Keeling) Islands", 480 | "Colombia", 481 | "Comoros", 482 | "Congo", 483 | "Congo, The Democratic Republic of the", 484 | "Cook Islands", 485 | "Costa Rica", 486 | "Cote D'ivoire", 487 | "Croatia", 488 | "Curaçao", 489 | "Cyprus", 490 | "Czech Republic", 491 | "Denmark", 492 | "Djibouti", 493 | "Dominica", 494 | "Dominican Republic", 495 | "Ecuador", 496 | "Egypt", 497 | "El Salvador", 498 | "Equatorial Guinea", 499 | "Eritrea", 500 | "Estonia", 501 | "Ethiopia", 502 | "Falkland Islands (Malvinas)", 503 | 
"Faroe Islands", 504 | "Fiji", 505 | "Finland", 506 | "France", 507 | "French Guiana", 508 | "French Polynesia", 509 | "French Southern Territories", 510 | "Gabon", 511 | "Gambia, The", 512 | "Georgia", 513 | "Germany", 514 | "Ghana", 515 | "Gibraltar", 516 | "Greece", 517 | "Greenland", 518 | "Grenada", 519 | "Guadeloupe", 520 | "Guatemala", 521 | "Guernsey", 522 | "Guinea", 523 | "Guinea-Bissau", 524 | "Guyana", 525 | "Haiti", 526 | "Heard Island and the McDonald Islands", 527 | "Holy See", 528 | "Honduras", 529 | "Hong Kong", 530 | "Hungary", 531 | "Iceland", 532 | "India", 533 | "Indonesia", 534 | "Iraq", 535 | "Ireland", 536 | "Isle of Man", 537 | "Israel", 538 | "Italy", 539 | "Jamaica", 540 | "Japan", 541 | "Jersey", 542 | "Jordan", 543 | "Kazakhstan", 544 | "Kenya", 545 | "Kiribati", 546 | "Korea, Republic of", 547 | "Kosovo", 548 | "Kuwait", 549 | "Kyrgyzstan", 550 | "Lao People's Democratic Republic", 551 | "Latvia", 552 | "Lebanon", 553 | "Lesotho", 554 | "Liberia", 555 | "Libya", 556 | "Liechtenstein", 557 | "Lithuania", 558 | "Luxembourg", 559 | "Macao", 560 | "Macedonia, The Former Yugoslav Republic of", 561 | "Madagascar", 562 | "Malawi", 563 | "Malaysia", 564 | "Maldives", 565 | "Mali", 566 | "Malta", 567 | "Marshall Islands", 568 | "Martinique", 569 | "Mauritania", 570 | "Mauritius", 571 | "Mayotte", 572 | "Mexico", 573 | "Micronesia, Federated States of", 574 | "Moldova, Republic of", 575 | "Monaco", 576 | "Mongolia", 577 | "Montenegro", 578 | "Montserrat", 579 | "Morocco", 580 | "Mozambique", 581 | "Myanmar", 582 | "Namibia", 583 | "Nauru", 584 | "Nepal", 585 | "Netherlands", 586 | "Netherlands Antilles", 587 | "New Caledonia", 588 | "New Zealand", 589 | "Nicaragua", 590 | "Niger", 591 | "Nigeria", 592 | "Niue", 593 | "Norfolk Island", 594 | "Norway", 595 | "Oman", 596 | "Pakistan", 597 | "Palau", 598 | "Palestinian Territories", 599 | "Panama", 600 | "Papua New Guinea", 601 | "Paraguay", 602 | "Peru", 603 | "Philippines", 604 | "Pitcairn", 605 | "Poland", 606 | "Portugal", 607 | "Qatar", 608 | "Reunion", 609 | "Romania", 610 | "Russian Federation", 611 | "Rwanda", 612 | "Saint Barthelemy", 613 | "Saint Helena, Ascension and Tristan da Cunha", 614 | "Saint Kitts and Nevis", 615 | "Saint Lucia", 616 | "Saint Martin", 617 | "Saint Pierre and Miquelon", 618 | "Saint Vincent and the Grenadines", 619 | "Samoa", 620 | "San Marino", 621 | "Sao Tome and Principe", 622 | "Saudi Arabia", 623 | "Senegal", 624 | "Serbia", 625 | "Seychelles", 626 | "Sierra Leone", 627 | "Singapore", 628 | "Sint Maarten", 629 | "Slovakia", 630 | "Slovenia", 631 | "Solomon Islands", 632 | "Somalia", 633 | "South Africa", 634 | "South Georgia and the South Sandwich Islands", 635 | "Spain", 636 | "Sri Lanka", 637 | "Suriname", 638 | "Svalbard and Jan Mayen", 639 | "Swaziland", 640 | "Sweden", 641 | "Switzerland", 642 | "Taiwan", 643 | "Tajikistan", 644 | "Tanzania, United Republic of", 645 | "Thailand", 646 | "Timor-leste", 647 | "Togo", 648 | "Tokelau", 649 | "Tonga", 650 | "Trinidad and Tobago", 651 | "Tunisia", 652 | "Turkey", 653 | "Turkmenistan", 654 | "Turks and Caicos Islands", 655 | "Tuvalu", 656 | "Uganda", 657 | "Ukraine", 658 | "United Arab Emirates", 659 | "United Kingdom", 660 | "United States Minor Outlying Islands", 661 | "Uruguay", 662 | "Uzbekistan", 663 | "Vanuatu", 664 | "Venezuela", 665 | "Vietnam", 666 | "Virgin Islands, British", 667 | "Wallis and Futuna", 668 | "Western Sahara", 669 | "Yemen", 670 | "Zambia", 671 | "Zimbabwe" 672 | ] 673 | } 674 | } 675 | 676 | 
-------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/OUTPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "products": { 3 | // ... 4 | }, 5 | 6 | "pageScreenshots": { 7 | 8 | }, 9 | } 10 | 11 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/dataset-viewer/.actor/OUTPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "staticChart": { 3 | "title": "Static Chart", 4 | "description": "The static HTML .", 5 | "type/source": "key-value-store", 6 | "file": "view.html" // ??? 7 | }, 8 | 9 | // log, requestQueue, OUTPUT.json, ..., adhoc-webhooks? 10 | 11 | "dynamicChart": { 12 | "title": "Dynamic chart", 13 | "description": "Web browser showing rich interactive view.", 14 | // This type says that the result is available in live-view, and can be rendered as HTML 15 | "type/source": "live-view", 16 | "defaultView": true, 17 | // Perhaps better way is: 18 | "source": "live-view", 19 | "viewer": "iframe" / "modal-iframe" ??? 20 | }, 21 | } 22 | 23 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/google_search_scraper/.ACTOR/ACTOR.json: -------------------------------------------------------------------------------- 1 | { 2 | "formatVersion": 2, 3 | // No username here, Actor can be deployed to any account 4 | "name": "google-search-scraper", 5 | // We're pushing towards having human readable names shown for Actors everywhere, 6 | // so we should probably let users define it here, even if they run this code outside of Apify. 7 | // But shall the text from here overwrite changes done manually by copywriter? Probably not, 8 | // so what's the purpose of having these here? 9 | "title": "Google Search Scraper", 10 | "description": "The 200-char description", 11 | "version": "0.0", 12 | "buildTag": "latest", 13 | "env": { 14 | "MYSQL_USER": "my_username", 15 | "MYSQL_PASSWORD": "@mySecretPassword" 16 | }, 17 | "template": "basic" 18 | } 19 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/google_search_scraper/.ACTOR/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": { 3 | "title": "Search queries or URLs", 4 | "type": "string", 5 | "description": "Google Search queries (e.g. food in NYC) and/or full URLs (e.g. https://www.google.com/search?q=food+NYC).

Enter one item per line.", 6 | "prefill": "Hotels in NYC\nRestaurants in NYC\nhttps://www.google.com/search?q=restaurants+in+NYC", 7 | "editor": "textarea", 8 | "pattern": "[^\\s]+" 9 | }, 10 | "countryCode": { 11 | "title": "Country", 12 | "type": "string", 13 | "description": "Country determines the IP address of the proxy used for the query and the Google Search domain (e.g. google.es for Spain). The values must be lower-cased ISO 3166 country codes supported by Google. By default, the Actor uses United States (google.com).

This setting only applies to Search queries, but not to URLs.", 14 | "default": "", 15 | "editor": "select", 16 | "enum": [ 17 | "", 18 | "af", 19 | "al", 20 | "dz", 21 | "as", 22 | "ad", 23 | "ao", 24 | "ai", 25 | "aq", 26 | "ag", 27 | "ar", 28 | "am", 29 | "aw", 30 | "au", 31 | "at", 32 | "az", 33 | "bs", 34 | "bh", 35 | "bd", 36 | "bb", 37 | "by", 38 | "be", 39 | "bz", 40 | "bj", 41 | "bm", 42 | "bt", 43 | "bo", 44 | "ba", 45 | "bw", 46 | "bv", 47 | "br", 48 | "io", 49 | "bn", 50 | "bg", 51 | "bf", 52 | "bi", 53 | "kh", 54 | "cm", 55 | "ca", 56 | "cv", 57 | "ky", 58 | "cf", 59 | "td", 60 | "cl", 61 | "cn", 62 | "cx", 63 | "cc", 64 | "co", 65 | "km", 66 | "cg", 67 | "cd", 68 | "ck", 69 | "cr", 70 | "ci", 71 | "hr", 72 | "cu", 73 | "cy", 74 | "cz", 75 | "dk", 76 | "dj", 77 | "dm", 78 | "do", 79 | "ec", 80 | "eg", 81 | "sv", 82 | "gq", 83 | "er", 84 | "ee", 85 | "et", 86 | "fk", 87 | "fo", 88 | "fj", 89 | "fi", 90 | "fr", 91 | "gf", 92 | "pf", 93 | "tf", 94 | "ga", 95 | "gm", 96 | "ge", 97 | "de", 98 | "gh", 99 | "gi", 100 | "gr", 101 | "gl", 102 | "gd", 103 | "gp", 104 | "gu", 105 | "gt", 106 | "gn", 107 | "gw", 108 | "gy", 109 | "ht", 110 | "hm", 111 | "va", 112 | "hn", 113 | "hk", 114 | "hu", 115 | "is", 116 | "in", 117 | "id", 118 | "ir", 119 | "iq", 120 | "ie", 121 | "il", 122 | "it", 123 | "jm", 124 | "jp", 125 | "jo", 126 | "kz", 127 | "ke", 128 | "ki", 129 | "kp", 130 | "kr", 131 | "kw", 132 | "kg", 133 | "la", 134 | "lv", 135 | "lb", 136 | "ls", 137 | "lr", 138 | "ly", 139 | "li", 140 | "lt", 141 | "lu", 142 | "mo", 143 | "mk", 144 | "mg", 145 | "mw", 146 | "my", 147 | "mv", 148 | "ml", 149 | "mt", 150 | "mh", 151 | "mq", 152 | "mr", 153 | "mu", 154 | "yt", 155 | "mx", 156 | "fm", 157 | "md", 158 | "mc", 159 | "mn", 160 | "ms", 161 | "ma", 162 | "mz", 163 | "mm", 164 | "na", 165 | "nr", 166 | "np", 167 | "nl", 168 | "an", 169 | "nc", 170 | "nz", 171 | "ni", 172 | "ne", 173 | "ng", 174 | "nu", 175 | "nf", 176 | "mp", 177 | "no", 178 | "om", 179 | "pk", 180 | "pw", 181 | "ps", 182 | "pa", 183 | "pg", 184 | "py", 185 | "pe", 186 | "ph", 187 | "pn", 188 | "pl", 189 | "pt", 190 | "pr", 191 | "qa", 192 | "re", 193 | "ro", 194 | "ru", 195 | "rw", 196 | "sh", 197 | "kn", 198 | "lc", 199 | "pm", 200 | "vc", 201 | "ws", 202 | "sm", 203 | "st", 204 | "sa", 205 | "sn", 206 | "cs", 207 | "sc", 208 | "sl", 209 | "sg", 210 | "sk", 211 | "si", 212 | "sb", 213 | "so", 214 | "za", 215 | "gs", 216 | "es", 217 | "lk", 218 | "sd", 219 | "sr", 220 | "sj", 221 | "sz", 222 | "se", 223 | "ch", 224 | "sy", 225 | "tw", 226 | "tj", 227 | "tz", 228 | "th", 229 | "tl", 230 | "tg", 231 | "tk", 232 | "to", 233 | "tt", 234 | "tn", 235 | "tr", 236 | "tm", 237 | "tc", 238 | "tv", 239 | "ug", 240 | "ua", 241 | "ae", 242 | "gb", 243 | "us", 244 | "um", 245 | "uy", 246 | "uz", 247 | "vu", 248 | "ve", 249 | "vn", 250 | "vg", 251 | "vi", 252 | "wf", 253 | "eh", 254 | "ye", 255 | "zm", 256 | "zw" 257 | ], 258 | "enumTitles": [ 259 | "United States", 260 | "Afghanistan", 261 | "Albania", 262 | "Algeria", 263 | "American Samoa", 264 | "Andorra", 265 | "Angola", 266 | "Anguilla", 267 | "Antarctica", 268 | "Antigua and Barbuda", 269 | "Argentina", 270 | "Armenia", 271 | "Aruba", 272 | "Australia", 273 | "Austria", 274 | "Azerbaijan", 275 | "Bahamas", 276 | "Bahrain", 277 | "Bangladesh", 278 | "Barbados", 279 | "Belarus", 280 | "Belgium", 281 | "Belize", 282 | "Benin", 283 | "Bermuda", 284 | "Bhutan", 285 | "Bolivia", 286 | "Bosnia and Herzegovina", 287 | "Botswana", 288 | "Bouvet Island", 289 | "Brazil", 290 | "British 
Indian Ocean Territory",
291 | "Brunei Darussalam",
292 | "Bulgaria",
293 | "Burkina Faso",
294 | "Burundi",
295 | "Cambodia",
296 | "Cameroon",
297 | "Canada",
298 | "Cape Verde",
299 | "Cayman Islands",
300 | "Central African Republic",
301 | "Chad",
302 | "Chile",
303 | "China",
304 | "Christmas Island",
305 | "Cocos (Keeling) Islands",
306 | "Colombia",
307 | "Comoros",
308 | "Congo",
309 | "Congo, the Democratic Republic of the",
310 | "Cook Islands",
311 | "Costa Rica",
312 | "Cote D'ivoire",
313 | "Croatia",
314 | "Cuba",
315 | "Cyprus",
316 | "Czech Republic",
317 | "Denmark",
318 | "Djibouti",
319 | "Dominica",
320 | "Dominican Republic",
321 | "Ecuador",
322 | "Egypt",
323 | "El Salvador",
324 | "Equatorial Guinea",
325 | "Eritrea",
326 | "Estonia",
327 | "Ethiopia",
328 | "Falkland Islands (Malvinas)",
329 | "Faroe Islands",
330 | "Fiji",
331 | "Finland",
332 | "France",
333 | "French Guiana",
334 | "French Polynesia",
335 | "French Southern Territories",
336 | "Gabon",
337 | "Gambia",
338 | "Georgia",
339 | "Germany",
340 | "Ghana",
341 | "Gibraltar",
342 | "Greece",
343 | "Greenland",
344 | "Grenada",
345 | "Guadeloupe",
346 | "Guam",
347 | "Guatemala",
348 | "Guinea",
349 | "Guinea-Bissau",
350 | "Guyana",
351 | "Haiti",
352 | "Heard Island and Mcdonald Islands",
353 | "Holy See (Vatican City State)",
354 | "Honduras",
355 | "Hong Kong",
356 | "Hungary",
357 | "Iceland",
358 | "India",
359 | "Indonesia",
360 | "Iran, Islamic Republic of",
361 | "Iraq",
362 | "Ireland",
363 | "Israel",
364 | "Italy",
365 | "Jamaica",
366 | "Japan",
367 | "Jordan",
368 | "Kazakhstan",
369 | "Kenya",
370 | "Kiribati",
371 | "Korea, Democratic People's Republic of",
372 | "Korea, Republic of",
373 | "Kuwait",
374 | "Kyrgyzstan",
375 | "Lao People's Democratic Republic",
376 | "Latvia",
377 | "Lebanon",
378 | "Lesotho",
379 | "Liberia",
380 | "Libyan Arab Jamahiriya",
381 | "Liechtenstein",
382 | "Lithuania",
383 | "Luxembourg",
384 | "Macao",
385 | "Macedonia, the Former Yugoslav Republic of",
386 | "Madagascar",
387 | "Malawi",
388 | "Malaysia",
389 | "Maldives",
390 | "Mali",
391 | "Malta",
392 | "Marshall Islands",
393 | "Martinique",
394 | "Mauritania",
395 | "Mauritius",
396 | "Mayotte",
397 | "Mexico",
398 | "Micronesia, Federated States of",
399 | "Moldova, Republic of",
400 | "Monaco",
401 | "Mongolia",
402 | "Montserrat",
403 | "Morocco",
404 | "Mozambique",
405 | "Myanmar",
406 | "Namibia",
407 | "Nauru",
408 | "Nepal",
409 | "Netherlands",
410 | "Netherlands Antilles",
411 | "New Caledonia",
412 | "New Zealand",
413 | "Nicaragua",
414 | "Niger",
415 | "Nigeria",
416 | "Niue",
417 | "Norfolk Island",
418 | "Northern Mariana Islands",
419 | "Norway",
420 | "Oman",
421 | "Pakistan",
422 | "Palau",
423 | "Palestinian Territory, Occupied",
424 | "Panama",
425 | "Papua New Guinea",
426 | "Paraguay",
427 | "Peru",
428 | "Philippines",
429 | "Pitcairn",
430 | "Poland",
431 | "Portugal",
432 | "Puerto Rico",
433 | "Qatar",
434 | "Reunion",
435 | "Romania",
436 | "Russian Federation",
437 | "Rwanda",
438 | "Saint Helena",
439 | "Saint Kitts and Nevis",
440 | "Saint Lucia",
441 | "Saint Pierre and Miquelon",
442 | "Saint Vincent and the Grenadines",
443 | "Samoa",
444 | "San Marino",
445 | "Sao Tome and Principe",
446 | "Saudi Arabia",
447 | "Senegal",
448 | "Serbia and Montenegro",
449 | "Seychelles",
450 | "Sierra Leone",
451 | "Singapore",
452 | "Slovakia",
453 | "Slovenia",
454 | "Solomon Islands",
455 | "Somalia",
456 | "South Africa",
457 | "South Georgia and the South Sandwich Islands",
458 | "Spain", 459 | "Sri Lanka", 460 | "Sudan", 461 | "Suriname", 462 | "Svalbard and Jan Mayen", 463 | "Swaziland", 464 | "Sweden", 465 | "Switzerland", 466 | "Syrian Arab Republic", 467 | "Taiwan, Province of China", 468 | "Tajikistan", 469 | "Tanzania, United Republic of", 470 | "Thailand", 471 | "Timor-Leste", 472 | "Togo", 473 | "Tokelau", 474 | "Tonga", 475 | "Trinidad and Tobago", 476 | "Tunisia", 477 | "Turkey", 478 | "Turkmenistan", 479 | "Turks and Caicos Islands", 480 | "Tuvalu", 481 | "Uganda", 482 | "Ukraine", 483 | "United Arab Emirates", 484 | "United Kingdom", 485 | "United States", 486 | "United States Minor Outlying Islands", 487 | "Uruguay", 488 | "Uzbekistan", 489 | "Vanuatu", 490 | "Venezuela", 491 | "Viet Nam", 492 | "Virgin Islands, British", 493 | "Virgin Islands, U.S.", 494 | "Wallis and Futuna", 495 | "Western Sahara", 496 | "Yemen", 497 | "Zambia", 498 | "Zimbabwe" 499 | ] 500 | }, 501 | "languageCode": { 502 | "title": "Language", 503 | "type": "string", 504 | "description": "Language for the search results, which is passed to Google Search as the hl URL query parameter. Only set this if you want to use a non-default language for the selected country. The values must be lower-cased ISO 639 language codes supported by Google.

This setting only applies to Search queries, but not to URLs.",
505 | "default": "",
506 | "editor": "select",
507 | "enum": [
508 | "",
509 | "af",
510 | "sq",
511 | "sm",
512 | "ar",
513 | "az",
514 | "eu",
515 | "be",
516 | "bn",
517 | "bh",
518 | "bs",
519 | "bg",
520 | "ca",
521 | "zh-CN",
522 | "zh-TW",
523 | "hr",
524 | "cs",
525 | "da",
526 | "nl",
527 | "en",
528 | "eo",
529 | "et",
530 | "fo",
531 | "fi",
532 | "fr",
533 | "fy",
534 | "gl",
535 | "ka",
536 | "de",
537 | "el",
538 | "gu",
539 | "iw",
540 | "hi",
541 | "hu",
542 | "is",
543 | "id",
544 | "ia",
545 | "ga",
546 | "it",
547 | "ja",
548 | "jw",
549 | "kn",
550 | "ko",
551 | "la",
552 | "lv",
553 | "lt",
554 | "mk",
555 | "ms",
556 | "ml",
557 | "mt",
558 | "mr",
559 | "ne",
560 | "no",
561 | "nn",
562 | "oc",
563 | "fa",
564 | "pl",
565 | "pt-BR",
566 | "pt-PT",
567 | "pa",
568 | "ro",
569 | "ru",
570 | "gd",
571 | "sr",
572 | "si",
573 | "sk",
574 | "sl",
575 | "es",
576 | "su",
577 | "sw",
578 | "sv",
579 | "tl",
580 | "ta",
581 | "te",
582 | "th",
583 | "ti",
584 | "tr",
585 | "uk",
586 | "ur",
587 | "uz",
588 | "vi",
589 | "cy",
590 | "xh",
591 | "zu"
592 | ],
593 | "enumTitles": [
594 | "Default",
595 | "Afrikaans",
596 | "Albanian",
597 | "Amharic",
598 | "Arabic",
599 | "Azerbaijani",
600 | "Basque",
601 | "Belarusian",
602 | "Bengali",
603 | "Bihari",
604 | "Bosnian",
605 | "Bulgarian",
606 | "Catalan",
607 | "Chinese (Simplified)",
608 | "Chinese (Traditional)",
609 | "Croatian",
610 | "Czech",
611 | "Danish",
612 | "Dutch",
613 | "English",
614 | "Esperanto",
615 | "Estonian",
616 | "Faroese",
617 | "Finnish",
618 | "French",
619 | "Frisian",
620 | "Galician",
621 | "Georgian",
622 | "German",
623 | "Greek",
624 | "Gujarati",
625 | "Hebrew",
626 | "Hindi",
627 | "Hungarian",
628 | "Icelandic",
629 | "Indonesian",
630 | "Interlingua",
631 | "Irish",
632 | "Italian",
633 | "Japanese",
634 | "Javanese",
635 | "Kannada",
636 | "Korean",
637 | "Latin",
638 | "Latvian",
639 | "Lithuanian",
640 | "Macedonian",
641 | "Malay",
642 | "Malayalam",
643 | "Maltese",
644 | "Marathi",
645 | "Nepali",
646 | "Norwegian",
647 | "Norwegian (Nynorsk)",
648 | "Occitan",
649 | "Persian",
650 | "Polish",
651 | "Portuguese (Brazil)",
652 | "Portuguese (Portugal)",
653 | "Punjabi",
654 | "Romanian",
655 | "Russian",
656 | "Scots Gaelic",
657 | "Serbian",
658 | "Sinhalese",
659 | "Slovak",
660 | "Slovenian",
661 | "Spanish",
662 | "Sundanese",
663 | "Swahili",
664 | "Swedish",
665 | "Tagalog",
666 | "Tamil",
667 | "Telugu",
668 | "Thai",
669 | "Tigrinya",
670 | "Turkish",
671 | "Ukrainian",
672 | "Urdu",
673 | "Uzbek",
674 | "Vietnamese",
675 | "Welsh",
676 | "Xhosa",
677 | "Zulu"
678 | ]
679 | },
680 | "locationUule": {
681 | "title": "UULE location code",
682 | "type": "string",
683 | "description": "The code for geolocation of search results. It's passed to Google Search as the uule URL query parameter. You can use the UULE code generator. Learn more about emulating local search.

This setting only applies to Search queries, but not to URLs.", 684 | "editor": "textfield" 685 | }, 686 | "resultsPerPage": { 687 | "title": "Results per page", 688 | "type": "integer", 689 | "description": "Number of search results per page. By default, Google Search returns 10 results. The allowed values are: 10, 20, 30, 40, 50 and 100.

This setting only applies to Search queries, but not to URLs.",
690 | "maximum": 100,
691 | "minimum": 1
692 | },
693 | "maxPagesPerQuery": {
694 | "title": "Max pages per query",
695 | "type": "integer",
696 | "description": "The maximum number of search result pages crawled for each search query or URL. Note that a value greater than one might significantly slow down the Actor.",
697 | "default": 1,
698 | "minimum": 1
699 | },
700 | "customDataFunction": {
701 | "title": "Custom data function",
702 | "type": "string",
703 | "description": "Custom JavaScript function to extract additional attributes from the HTML of the result pages. The function accepts the same parameters as the handlePageFunction of the CheerioCrawler in the Apify SDK. The return value of the function is saved to the results as the customData property.",
704 | "editor": "javascript",
705 | "prefill": "async ({ input, $, request, response, html }) => {\n return {\n pageTitle: $('title').text(),\n };\n};"
706 | },
707 | "maxConcurrency": {
708 | "title": "Max concurrency",
709 | "type": "integer",
710 | "description": "The maximum number of search results pages the crawler will load in parallel. A higher number means you will get your results faster, but it will also burn through your available proxies more quickly.",
711 | "default": 10,
712 | "maximum": 100,
713 | "minimum": 1
714 | },
715 | "saveHtml": {
716 | "title": "Save HTML to dataset",
717 | "type": "boolean",
718 | "description": "If checked, the HTML of the Google Search results pages will be stored to the default dataset, under the html property. This is useful if you need to process the HTML, but it makes the dataset large and reduces performance.",
719 | "default": false,
720 | "groupCaption": "Options"
721 | },
722 | "saveHtmlToKeyValueStore": {
723 | "title": "Save HTML to key-value store",
724 | "type": "boolean",
725 | "description": "If checked, the HTML of the Google Search results pages will be stored to the default key-value store, and links to the files will be stored to the dataset under the htmlSnapshotUrl property. This is useful for debugging, since you can easily view the pages in a browser, but use of this feature has some performance penalty.",
726 | "default": false
727 | },
728 | "mobileResults": {
729 | "title": "Mobile results",
730 | "type": "boolean",
731 | "description": "If checked, the crawler will return results for the mobile version of Google Search. By default, desktop results are returned.",
732 | "default": false
733 | },
734 | "includeUnfilteredResults": {
735 | "title": "Include unfiltered results",
736 | "type": "boolean",
737 | "description": "If checked, the lower-quality results that Google normally filters out will be included. Usually this amounts to a few hundred extra results.",
738 | "default": false
739 | }
740 | }
741 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/OUTPUT.json:
--------------------------------------------------------------------------------
1 | // Example of an OUTPUT.json file, automatically generated by the system according to OUTPUT_SCHEMA.json
2 | // This could be the result of the run-Actor sync/async API endpoint, in case there is no OUTPUT.json?
3 | 
4 | {
5 |   "searchResults": "https://api.apify.com/v2/datasets/[DEFAULT_DATASET_ID]/items?format=[FORMAT]",
6 | 
7 |   "pageHtmlSnapshots": "https://api.apify.com/v2/key-value-stores/[DEFAULT_KEY_VALUE_STORE_ID]/records?prefix=[PREFIX]",
8 | 
9 |   "pageScreenshots": "https://api.apify.com/v2/key-value-stores/[DEFAULT_KEY_VALUE_STORE_ID]/records?prefix=[PREFIX]",
10 | 
11 |   "demo": "liveviewUrl when Actor runs"
12 | }
13 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/OUTPUT_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |   // The system will generate the OUTPUT.json file and save it right away, according to the schema below,
3 |   // so consumers can read it immediately after the Actor starts. This is done before writing INPUT.json
4 |   "searchResults": {
5 |     "title": "Search results",
6 |     "description": "The main results of the Actor; each record is one Google SERPs page.",
7 |     "type": "dataset", // or "default-dataset" ?
8 |     // How to tell the system whether it should use the default dataset, or a named one? Or what if the Actor caller
9 |     // could say which dataset/kv-store should be used for the run (new, or existing with a name),
10 |     // and "dataset" here would just mean the dataset produced by the Actor.
11 |     "default": true,
12 |     "schemaFile": "./schemas/GOOGLE_SERPS_DATASET_SCHEMA.json"
13 |   },
14 | 
15 |   "pageHtmlSnapshots": {
16 |     "title": "Page HTML snapshots",
17 |     "description": "Saved snapshots of the search result pages. Only available if enabled on input. Useful for testing, to review the original source.",
18 |     "type/source/target/location": "key-value-store", // or "default-key-value-store" ?
19 |     // Defines a filter for the records to show/return; we'd have to add a "prefix" query param to the kv-store API
20 |     "prefix": "SNAPSHOT-html-",
21 |     // How to enforce???? Maybe in the future
22 |     "contentType": "text/html"
23 |   },
24 | 
25 |   "pageScreenshots": {
26 |     "title": "Page screenshots",
27 |     "description": "Saved screenshots of the search result pages. Only available if enabled on input. Useful for testing, to review the original source.",
28 |     "type": "key-value-store", // or "default-key-value-store" ?
29 |     "prefix": "SCREENSHOT-png-"
30 |   }
31 | }
32 | 
33 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/schemas/GOOGLE_SERPS_DATASET_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Google Search Results",
3 |   "description": "Data about Google Search results yadadada",
4 | 
5 |   // This is basically a dataset schema here (so maybe call it "schema"?):
6 |   "fields": {
7 |     "url": "string",
8 |     "hasNextPage": "boolean",
9 |     "resultsTotal": "number",
10 |     "relatedQueries": "array",
11 |     "searchQuery": {
12 |       "term": "string",
13 |       "page": "number",
14 |       "type": "string",
15 |       "domain": "string",
16 |       "countryCode": "string",
17 |       "languageCode": "string",
18 |       "locationUule": "string",
19 |       "resultsPerPage": "number"
20 |     },
21 |     "organicResults": "array",
22 |     "organicResults.$": {
23 |       // ... xxx
24 |     },
25 |     "paidResults": "array",
26 |     // Or use this syntax?
27 |     "searchQuery": "object",
28 |     "searchQuery.term": {
29 |       "type": "string",
30 |       "optional": true
31 |     },
32 |     "searchQuery.page": "number",
33 |     "searchQuery.type": "string",
34 |     "searchQuery.domain": "string",
35 |     "searchQuery.countryCode": "string",
36 |     "searchQuery.languageCode": "string",
37 |     "searchQuery.locationUule": "string",
38 |     "searchQuery.resultsPerPage": "number"
39 |   },
40 |   // This should tell the output consumers how to render or preview the data.
41 |   "views": {
42 |     "default": {
43 |       "name": "All SERPs grouped by page",
44 |       "fields": "searchQuery.term,organicResults",
45 |       "descending": true,
46 |       "format": "html" // Is this needed? The consumer of the output should pick the format
47 |     },
48 |     "raw": {
49 |       "name": "All search results",
50 |       "unwind": "organicResults",
51 |       "fields": "searchQuery,organicResults"
52 |     }
53 |   },
54 | 
55 |   // Similar to prefill; not sure about this. Who will fill it in?
56 |   // Maybe go back to an example run, or just a dataset URL?
57 |   "example": [{
58 |     "url": "http://google.com",
59 |     "searchQuery": "ffwef",
60 |     ...
61 |   }]
62 | }
63 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/schemas/GOOGLE_SERPS_SCREENSHOTS_KV_STORE_SCHEMA.json:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | // TODO: Does this make sense??? What for??? A dataset is different from a KV-store...
4 | // Maybe it would be useful in that it would:
5 | // 1) Check that new records have the correct content-type when storing them, and fail otherwise
6 | // 2) Somehow render the view differently???
7 | // IMHO both cases are quite weak, plus we'd need to allow multiple such schemas per KV-store (unlike a dataset),
8 | // so I'd skip this.
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-frontmatter>=1.0.0
2 | markdown>=3.4.0
3 | regex>=2023.0.0
--------------------------------------------------------------------------------
/scripts/md2mdx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | Markdown to MDX Transformer
5 | ---------------------------
6 | 
7 | This script transforms standard Markdown files into MDX files for use with Astro.
8 | It performs several transformations:
9 | 
10 | 1. Processes ASTRO comments into component tags:
11 |     - CodeSwitcher and CodeExample components.
12 |     - Illustration, Diagram, and Picture components.
13 |     - Removes redundant titles in code blocks.
14 | 2. Transforms image references to Astro Picture components.
15 | 3. Removes the table of contents (the Astro site has its own table of contents).
16 | 4. Adds a GitHub header.
17 | 5. Removes bold formatting from fully bold lines.
18 | 6. Transforms schema file links to proper paths.
19 | 
20 | Usage:
21 |     The script reads Markdown files from the source root (README.md and pages/*.md)
22 |     and writes MDX files to sync/target/src/content/pages/ (README.md becomes index.mdx):
23 | 
24 |     $ python3 scripts/md2mdx.py
25 | 
26 | Dependencies:
27 |     - python-frontmatter
28 |     - pathlib
29 |     - re (regex)
30 | """
31 | 
32 | import os
33 | import sys
34 | from pathlib import Path
35 | import frontmatter
36 | import re
37 | import glob
38 | import argparse
39 | 
40 | 
41 | def parse_args():
42 |     """Parse command line arguments."""
43 | 
44 |     parser = argparse.ArgumentParser(description='Transform Markdown files to MDX format.')
45 |     parser.add_argument('--source', type=str, help='Source directory containing markdown files')
46 |     parser.add_argument('--target', type=str, help='Target directory for MDX files')
47 | 
48 |     return parser.parse_args()
49 | 
50 | 
51 | # Get the project root - need to handle both direct run and test-sync run.
52 | SCRIPT_PATH = Path(__file__).resolve()
53 | args = parse_args()
54 | 
55 | print(f'Script location: {__file__}')
56 | 
57 | if args.source and args.target:
58 |     SOURCE_ROOT = Path(args.source).resolve()
59 |     TARGET_ROOT = Path(args.target).resolve()
60 | else:
61 |     PROJECT_ROOT = SCRIPT_PATH.parent.parent
62 |     SOURCE_ROOT = PROJECT_ROOT / 'sync/source'
63 |     TARGET_ROOT = PROJECT_ROOT / 'sync/target'
64 | 
65 | print(f'Source root: {SOURCE_ROOT}')
66 | print(f'Target root: {TARGET_ROOT}')
67 | 
68 | # Required imports for the MDX file.
69 | ASTRO_IMPORTS = '''import { Picture } from 'astro:assets';
70 | import CodeExample from '../../components/CodeExample.astro';
71 | import CodeSwitcher from '../../components/CodeSwitcher.astro';
72 | import Diagram from '../../components/Diagram.astro';
73 | import GitHubHeader from '../../components/GitHubHeader.astro';
74 | import illuApifyStore from './illu-apify-store@2x.png';
75 | import illuAPIGetInput from './illu-get-input@2x.png';
76 | import illuAPIKeyValueStoreAccess from './illu-api-key-value-store-access@2x.png';
77 | import illuAPIMetamorph from './illu-api-metamorph@2x.gif';
78 | import illuAPIPush from './illu-api-push@2x.gif';
79 | import illuAPIReboot from './illu-api-reboot@2x.png';
80 | import illuAPIStartAnother from './illu-api-start-another@2x.png';
81 | import illuAPIWebServer from './illu-api-webserver@2x.gif';
82 | import illuBasicConceptsInput from './illu-basic-concepts-input@2x.gif';
83 | import illuBasicConceptsIntegrations from './illu-basic-concepts-integrations@2x.png';
84 | import illuBasicConceptsOutput from './illu-basic-concepts-output@2x.gif';
85 | import illuBasicConceptsRunEnvironment from './illu-basic-concepts-docker@2x.gif';
86 | import illuBasicConceptsStorage from './illu-basic-concepts-storage@2x.png';
87 | import illuBasicConceptsStorageDataset from './illu-basic-concepts-storage-dataset@2x.png';
88 | import illuBasicConceptsStorageKeyValueStore from './illu-basic-concepts-storage-key-value-store@2x.png';
89 | import illuDatasetSchema from './illu-dataset-schema@2x.png';
90 | import illuDefinitionFilesInputSchemaFile from './illu-definition-files-input-schema-file@2x.png';
91 | import illuDefinitionFilesOutputSchemaFile from './illu-definition-files-output-schema-file@2x.png';
92 | import illuDevelopmentDeployment from './illu-development-deployment@2x.png';
93 | import illuDevelopmentLocal from './illu-development-local@2x.png';
94 | import illuDiagramHoriz from './illu-diagram-horiz@2x.png';
95 | import illuDiagramVert from './illu-diagram-vert@2x.png';
96 | import illuPhilosophyWhyTheName from 
'./illu-philosophy-why-the-name@2x.png'; 97 | import illuSharingChargingMoney from './illu-sharing-charging-money@2x.gif'; 98 | import illuSharingMonetization from './illu-sharing-monetization@2x.png'; 99 | import Illustration from '../../components/Illustration.astro'; 100 | import illuTakerInput from './illu-taker-input@2x.png';''' 101 | 102 | 103 | IGNORED_FILES = { 104 | 'license.md', # ignore case-insensitive 105 | # Add more files here as needed, e.g.: 106 | # 'contributing.md', 107 | # 'changelog.md', 108 | } 109 | 110 | 111 | def should_process_file(path: Path) -> bool: 112 | """Determine if a file should be processed based on ignore rules.""" 113 | 114 | # Case-insensitive filename check. 115 | if path.name.lower() in IGNORED_FILES: 116 | print(f'\n󰋼 Skipping ignored file: {path.name}') 117 | return False 118 | return True 119 | 120 | 121 | def remove_table_of_contents(content: str) -> str: 122 | """Remove the table of contents section from the markdown content.""" 123 | 124 | print('\n󰋼 Removing table of contents...') 125 | 126 | def replace_toc(match): 127 | print(' ⭮ Removed table of contents section') 128 | return '' 129 | 130 | return re.sub( 131 | r'## Contents\n\n[\s\S]*?', 132 | replace_toc, 133 | content 134 | ) 135 | 136 | 137 | def transform_image_references(content: str) -> str: 138 | """Transform markdown image references to Astro Picture components.""" 139 | 140 | print('\n󰋼 Transforming image references...') 141 | 142 | def replace_image(match): 143 | alt, src = match.groups() 144 | print(f' ⭮ {src}') 145 | basename = os.path.basename(src) 146 | return f'' 147 | 148 | return re.sub( 149 | r'!\[(.*?)\]\((.*?)\)', 150 | replace_image, 151 | content 152 | ) 153 | 154 | 155 | def add_github_header(content: str, is_readme: bool) -> str: 156 | """Add GitHub header component after the first heading, but only for README.md.""" 157 | 158 | if not is_readme: 159 | return content 160 | 161 | print('\n󰋼 Adding GitHub header...') 162 | print(' ⭮ Adding GitHub header') 163 | return re.sub( 164 | r'(#\s+[^\n]*\n)(\n?)', 165 | r'\1\n\n\n', 166 | content, 167 | count=1 168 | ) 169 | 170 | 171 | def remove_bold_formatting(content: str) -> str: 172 | """Remove bold formatting from lines that are entirely bold.""" 173 | 174 | print('\n󰋼 Removing bold formatting...') 175 | 176 | def replace_bold(match): 177 | text = match.group(1) 178 | print(f' ⭮ {text[:120]}') 179 | return text 180 | 181 | return re.sub( 182 | r'^\*\*(.*?)\*\*$', 183 | replace_bold, 184 | content, 185 | flags=re.MULTILINE 186 | ) 187 | 188 | 189 | def remove_picture_components(content: str) -> str: 190 | """Remove Picture components that aren't preceded by ASTRO comments.""" 191 | 192 | print('\n󰋼 Removing Picture components...') 193 | 194 | def replace_picture(match): 195 | picture = re.sub(r'\s+', ' ', match.group(0)) 196 | print(f' ⭮ {picture[:120]}') 197 | return '' 198 | 199 | return re.sub( 200 | r'(?', 201 | replace_picture, 202 | content, 203 | flags=re.MULTILINE | re.DOTALL 204 | ) 205 | 206 | 207 | def transform_astro_blocks(content: str) -> str: 208 | """Transform ASTRO comments into component tags. 209 | 210 | This function processes: 211 | 1. CodeSwitcher and CodeExample components, removing redundant titles. 212 | 2. Illustration, Diagram and Picture components. 213 | """ 214 | 215 | print('\n󰋼 Transforming ASTRO blocks...') 216 | 217 | def replace_astro_block(match): 218 | # Get the component definition but preserve internal whitespace. 
219 | component = match.group(1).strip() 220 | 221 | # Handle CodeSwitcher tags. 222 | if component == '': 223 | print(' ⭮ Adding CodeSwitcher opening tag') 224 | return '' 225 | elif component == '': 226 | print(' ⭮ Adding CodeSwitcher closing tag') 227 | return '' 228 | 229 | # Handle CodeExample tags with titles. 230 | code_example_match = re.match(r'', component) 231 | 232 | if code_example_match: 233 | title = code_example_match.group(1) 234 | print(f' ⭮ Adding CodeExample tag with title: {title}') 235 | return f'' 236 | elif component == '': 237 | print(' ⭮ Adding CodeExample closing tag') 238 | return '' 239 | 240 | # Handle media components (Illustration, Diagram, Picture). 241 | if (component.startswith('' 249 | 250 | # First transform all ASTRO comments to their respective components. 251 | content = re.sub( 252 | r'', 253 | replace_astro_block, 254 | content, 255 | flags=re.MULTILINE | re.DOTALL 256 | ) 257 | 258 | # Then remove redundant h3/h4 titles that appear right after CodeExample tags. 259 | def remove_redundant_titles(match): 260 | block = match.group(0) 261 | 262 | # Match any h3 or h4 heading after the opening tag, including across newlines. 263 | block = re.sub( 264 | r'(]+>)(\s*\n)*\s*#{3,4}[^\n]+\n', 265 | lambda m: print(' ⭮ Removing heading after CodeExample') or m.group(1) + '\n', 266 | block, 267 | count=1 # Only remove the first heading found 268 | ) 269 | 270 | return block 271 | 272 | # Process each CodeExample block to remove redundant titles. 273 | content = re.sub( 274 | r']+>[\s\S]+?', 275 | remove_redundant_titles, 276 | content 277 | ) 278 | 279 | return content 280 | 281 | 282 | def transform_schema_links(content: str) -> str: 283 | """Transform schema file links to their proper paths.""" 284 | 285 | print('\n󰋼 Transforming schema links...') 286 | 287 | def replace_link(match, suffix_lower): 288 | text, path = match.groups() 289 | new_path = f'/{path.lower().replace("_", "-")}-{suffix_lower}' 290 | print(f' ⭮ {text} → {new_path}') 291 | return f'[{text}]({new_path})' 292 | 293 | # Define patterns for both schema and file links. 294 | replacements = { 295 | r'\[([^]]+)\]\(./pages/([^)]+)_SCHEMA\.md\)': 296 | lambda m: replace_link(m, 'schema'), 297 | r'\[([^]]+)\]\(./pages/([^)]+)_FILE\.md\)': 298 | lambda m: replace_link(m, 'file') 299 | } 300 | 301 | # Apply each replacement pattern. 302 | for pattern, replacement in replacements.items(): 303 | content = re.sub(pattern, replacement, content) 304 | 305 | return content 306 | 307 | 308 | def remove_html_comments(content: str) -> str: 309 | """Remove all HTML comments from the content.""" 310 | 311 | print('\n󰋼 Removing HTML comments...') 312 | 313 | def replace_comment(match): 314 | comment = match.group(0) 315 | print(f' ⭮ Removing comment: {comment[:120]}') 316 | return '' 317 | 318 | return re.sub( 319 | r'', 320 | replace_comment, 321 | content, 322 | flags=re.MULTILINE | re.DOTALL 323 | ) 324 | 325 | 326 | def transform_internal_links(content: str) -> str: 327 | """Transform internal markdown links to the new MDX format.""" 328 | 329 | print('\n󰋼 Transforming internal links...') 330 | 331 | def format_link_text(text): 332 | """Convert technical names to readable titles.""" 333 | 334 | # Remove file extensions. 335 | text = re.sub(r'\.(json|md)$', '', text) 336 | 337 | # Handle special cases. 338 | if text == 'README': 339 | return 'Documentation' 340 | 341 | # Convert UPPER_CASE to Title Case. 
342 | if text.isupper(): 343 | words = text.split('_') 344 | return ' '.join(word.capitalize() for word in words) 345 | 346 | return text 347 | 348 | def replace_link(match): 349 | text, path, anchor = match.groups() 350 | 351 | # Handle different link types. 352 | if path: 353 | # Remove .md extension if present. 354 | path = path.replace('.md', '') 355 | 356 | if path == '../README': 357 | # Links to README become root links. 358 | new_path = '/' 359 | else: 360 | # Remove ./ or / prefix if present. 361 | path = path.lstrip('./').lstrip('/') 362 | 363 | # Convert to kebab case. 364 | new_path = '/' + path.lower().replace('_', '-') 365 | else: 366 | new_path = '' 367 | 368 | # Add anchor if present. 369 | if anchor: 370 | new_path = f"{new_path}{anchor}" 371 | 372 | # Format the link text if it's a technical name. 373 | if text.endswith('.md') or text.endswith('.json') or text.isupper() or '.json' in text: 374 | text = format_link_text(text) 375 | 376 | print(f' ⭮ {text} → {new_path}') 377 | return f'[{text}]({new_path})' 378 | 379 | # First pass: handle standard markdown links. 380 | pattern = r'\[([^\]]+)\]\(((?!http)[^)#\s]+)?([#][^)\s]+)?\)' 381 | content = re.sub(pattern, replace_link, content) 382 | 383 | # Second pass: handle already transformed links but with technical names. 384 | pattern = r'\[([A-Z_]+(?:\.(?:json|md))?)\](/[a-z-]+(?:[#][^)\s]+)?)\)' 385 | content = re.sub(pattern, lambda m: f'[{format_link_text(m.group(1))}]{m.group(2)}', content) 386 | 387 | return content 388 | 389 | 390 | def remove_img_tags(content: str) -> str: 391 | """Remove HTML img tags from the content.""" 392 | 393 | print('\n󰋼 Removing img tags...') 394 | 395 | def replace_img(match): 396 | img = match.group(0) 397 | print(f' ⭮ Removing img tag: {img[:120]}') 398 | return '' 399 | 400 | return re.sub( 401 | r']+>', 402 | replace_img, 403 | content 404 | ) 405 | 406 | 407 | def transform_inline_references(content: str) -> str: 408 | """Transform inline file references and URLs to proper format.""" 409 | 410 | print('\n󰋼 Transforming inline references...') 411 | 412 | def replace_reference(match): 413 | path = match.group(1) 414 | 415 | # Remove .md extension if present. 416 | path = path.replace('.md', '') 417 | 418 | # Convert to kebab case and add leading slash. 419 | new_path = '/' + path.lstrip('./').lstrip('/').lower().replace('_', '-') 420 | 421 | print(f' ⭮ {path} → {new_path}') 422 | 423 | return new_path 424 | 425 | # Transform file references like ./DATASET_SCHEMA.md to /dataset-schema. 426 | content = re.sub( 427 | r'(?<=See )\.?/?([A-Z_]+\.md)', 428 | replace_reference, 429 | content 430 | ) 431 | 432 | # Transform TODO references using a separate pattern. 433 | pattern = r"([A-Z_]+\.md)" 434 | content = re.sub( 435 | r'Move to ([A-Z_]+\.md)', 436 | lambda m: f'Move to {replace_reference(re.match(pattern, m.group(1)))}', 437 | content 438 | ) 439 | 440 | return content 441 | 442 | 443 | def transform_markdown_to_mdx(content: str, source_file: Path) -> str: 444 | """Main transformation pipeline to convert markdown to MDX format.""" 445 | 446 | print('\n󰋼 Parsing frontmatter...') 447 | post = frontmatter.loads(content) 448 | 449 | is_readme = source_file.name.lower() == 'readme.md' 450 | 451 | # Apply transformations in sequence. 
    transformed = remove_table_of_contents(post.content)
    transformed = transform_image_references(transformed)
    transformed = remove_picture_components(transformed)
    transformed = transform_astro_blocks(transformed)
    transformed = add_github_header(transformed, is_readme)
    transformed = remove_bold_formatting(transformed)
    transformed = transform_schema_links(transformed)
    transformed = transform_internal_links(transformed)
    transformed = transform_inline_references(transformed)
    transformed = remove_html_comments(transformed)
    transformed = remove_img_tags(transformed)

    print('\n󰋼 Combining with Astro imports...')
    return f'{ASTRO_IMPORTS}\n\n{transformed}'


def get_target_path(source_path: Path) -> Path:
    """Convert source path to target path using the required transformations."""

    # Get relative path from source root.
    rel_path = source_path.relative_to(SOURCE_ROOT)

    # Transform filename.
    stem = rel_path.stem.lower().replace('_', '-')
    new_name = f"{stem}.mdx"

    # Construct target path.
    if source_path.name == 'README.md':
        # Special case for README.md -> index.mdx.
        return TARGET_ROOT / 'src/content/pages/index.mdx'
    else:
        # For files in pages directory.
        return TARGET_ROOT / 'src/content/pages' / new_name


def process_files():
    """Main function to process all markdown files."""

    try:
        # Find all markdown files to process.
        source_files = [
            Path(p) for p in [
                *glob.glob(str(SOURCE_ROOT / '*.md')),       # root md files
                *glob.glob(str(SOURCE_ROOT / 'pages/*.md'))  # files in pages directory
            ]
            if should_process_file(Path(p))  # filter out ignored files
        ]

        print(f'\n󰋼 Found {len(source_files)} markdown files to process')

        for source_file in source_files:
            target_file = get_target_path(source_file)
            print(f'\n󰋼 Processing: {source_file.name} → {target_file.name}')

            # Read source content.
            print(f' Reading source file: {source_file}')
            with open(source_file, 'r', encoding='utf-8') as f:
                content = f.read()
            print(f' Source file size: {len(content)} bytes')

            # Transform content.
            print('\n󰋼 Transforming content...')
            transformed_content = transform_markdown_to_mdx(content, source_file)
            print(f' ⭮ {len(transformed_content)} bytes')

            # Write target file.
            print(f'\n󰋼 Writing target file: {target_file}')
            os.makedirs(target_file.parent, exist_ok=True)
            with open(target_file, 'w', encoding='utf-8') as f:
                f.write(transformed_content)
            print(f' ⭮ {source_file.name} → {target_file.name}')

        print('\n󰋼 Formatting MDX files...')
        os.system('npm run format-sync')

        print('\n Done')

    except Exception as error:
        print('\n❌ Error processing files:', str(error))
        sys.exit(1)


if __name__ == '__main__':
    process_files()

--------------------------------------------------------------------------------
/scripts/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

echo "󰋼 Setting up development environment..."

# Create virtual environment if it doesn't exist.
if [ ! -d ".venv" ]; then
    echo -e "\n󰋼 Creating Python virtual environment..."
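    # `python3 -m venv .venv` creates an isolated interpreter and pip under
    # .venv/, so the dependency installs below never touch the system Python.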
    python3 -m venv .venv
fi

# Activate virtual environment.
echo -e "\n󰋼 Activating virtual environment..."
source .venv/bin/activate || source .venv/Scripts/activate

# Install/upgrade pip and dependencies.
echo -e "\n󰋼 Installing Python dependencies..."
python3 -m pip install --upgrade pip
python3 -m pip install -r requirements.txt

# Make scripts executable.
echo -e "\n󰋼 Making scripts executable..."
chmod +x scripts/*.sh

echo -e "\n󰋼 Setup complete! Activate the virtual environment with:"
echo "source .venv/bin/activate  # Unix/Mac"
echo ".venv\\Scripts\\activate   # Windows"

--------------------------------------------------------------------------------
/scripts/test-sync.sh:
--------------------------------------------------------------------------------
#!/bin/bash

WORK_DIR="sync"

echo "󰋼 Starting sync test..."

rm -rf $WORK_DIR
mkdir -p $WORK_DIR
cd $WORK_DIR

if [ ! -d "source" ]; then
    mkdir source
    cp -r ../{pages,*.md} source/
    echo -e "\n Source files copied"
else
    echo -e "\n󰋼 Using existing source directory"
fi

echo -e "\n\n"

if [ ! -d "target" ]; then
    git clone https://github.com/apify/actor-whitepaper-web target
    echo -e "\n Target repository cloned"
else
    echo -e "\n󰋼 Using existing target directory"
fi

echo -e "\n\n"

cd target
git pull origin main
echo -e "\n Target repository updated"

echo -e "\n\n"

cd ../..

if [ ! -d ".venv" ]; then
    python3 -m venv .venv
    echo -e "\n Python virtual environment created"
fi

echo "Current path: $(pwd)"
source .venv/bin/activate

echo -e "\n\n"

python3 -m pip install --upgrade pip
python3 -m pip install -r ./requirements.txt
echo -e "\n Python dependencies installed"

echo -e "\n\n"
echo "Current path: $(pwd)"
python3 scripts/md2mdx.py --source $WORK_DIR/source --target $WORK_DIR/target
echo -e "\n MD to MDX conversion completed"

echo -e "\n\n"

# Note: the script is back at the repo root here, so the target clone lives
# under $WORK_DIR/target, not ./target.
cd $WORK_DIR/target
git status
echo -e "\n Target repository status checked"
cd ../..

echo -e "\n\n"

echo -e "\n Check changes in ${WORK_DIR}/target"

deactivate

echo -e "\n󰋼 Done"

--------------------------------------------------------------------------------
/sync-pr-template.txt:
--------------------------------------------------------------------------------
Automated sync of Whitepaper content to MDX format.

This PR was automatically generated by the sync workflow at https://github.com/apify/actor-whitepaper.

- [ ] Review content changes
- [ ] Check MDX formatting

--------------------------------------------------------------------------------