├── .editorconfig ├── .github └── workflows │ └── sync-to-astro.yml ├── .gitignore ├── LICENSE.md ├── README.md ├── img ├── apify-actor-drawing.png ├── apify-store.png └── screenshot-taker-input.png ├── package-lock.json ├── package.json ├── pages ├── ACTOR_FILE.md ├── DATASET_SCHEMA.md ├── IDEAS.md ├── INPUT_SCHEMA.md ├── KEY_VALUE_STORE_SCHEMA.md ├── OUTPUT_SCHEMA.md ├── REQUEST_QUEUE_SCHEMA.md └── tmp_schema_experiments │ ├── amazon_scraper │ └── .actor │ │ ├── ACTOR.json │ │ ├── INPUT_SCHEMA.json │ │ └── OUTPUT_SCHEMA.json │ ├── dataset-viewer │ └── .actor │ │ └── OUTPUT_SCHEMA.json │ └── google_search_scraper │ └── .ACTOR │ ├── ACTOR.json │ ├── INPUT_SCHEMA.json │ ├── OUTPUT.json │ ├── OUTPUT_SCHEMA.json │ └── schemas │ ├── GOOGLE_SERPS_DATASET_SCHEMA.json │ └── GOOGLE_SERPS_SCREENSHOTS_KV_STORE_SCHEMA.json ├── requirements.txt ├── scripts ├── md2mdx.py ├── setup.sh └── test-sync.sh └── sync-pr-template.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.md] 2 | indent_size = 2 3 | indent_style = space 4 | -------------------------------------------------------------------------------- /.github/workflows/sync-to-astro.yml: -------------------------------------------------------------------------------- 1 | name: Sync Whitepaper to Astro (PR flow) 2 | 3 | # Triggers on push to master when MD files or related assets change. 4 | on: 5 | push: 6 | branches: 7 | - master 8 | paths: 9 | - '*.md' 10 | - 'pages/**/*.md' 11 | - '.github/workflows/sync-to-astro.yml' 12 | - 'scripts/**/*.py' 13 | workflow_dispatch: # also allows manual trigger from GitHub UI 14 | 15 | env: 16 | TARGET_REPO: "apify/actor-whitepaper-web" 17 | TARGET_BRANCH: "sync/whitepaper-updates" 18 | 19 | jobs: 20 | sync: 21 | name: Sync Whitepaper to Astro (PR flow) 22 | runs-on: ubuntu-latest 23 | permissions: 24 | contents: write # needed for pushing changes 25 | pull-requests: write # needed for creating PRs 26 | 27 | steps: 28 | # Step 1: Clone the source repo (this repo). 29 | - name: Checkout source repo 30 | uses: actions/checkout@v4 31 | with: 32 | path: sync/source 33 | 34 | # Step 2: Clone the target repo (Astro site). 35 | - name: Checkout target repo 36 | uses: actions/checkout@v4 37 | with: 38 | repository: ${{ env.TARGET_REPO }} 39 | path: sync/target 40 | token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} 41 | 42 | # Step 3: Set up the Python environment. 43 | - name: Setup Python 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: '3.11' 47 | cache: 'pip' 48 | cache-dependency-path: sync/source/requirements.txt 49 | 50 | # Step 4: Install dependencies. 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install -r sync/source/requirements.txt 55 | cp sync/source/package.json . && npm install 56 | 57 | # Step 5: Run the MD to MDX conversion script. 58 | - name: Run sync script 59 | run: python sync/source/scripts/md2mdx.py --source sync/source --target sync/target 60 | 61 | # Step 6: Create or update PR with changes. 62 | - name: Create Pull Request 63 | env: 64 | GH_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} 65 | run: | 66 | cd sync/target 67 | git status 68 | 69 | # Create a unique branch name with timestamp. 
70 | BRANCH_NAME="sync/whitepaper-updates-$(date +%Y%m%d-%H%M%S)" 71 | echo "Using branch: $BRANCH_NAME" 72 | 73 | git config user.name "github-actions[bot]" 74 | git config user.email "github-actions[bot]@users.noreply.github.com" 75 | 76 | git checkout -b "$BRANCH_NAME" 77 | 78 | # Only create PR if there are changes. 79 | if [[ -n "$(git status --porcelain)" ]]; then 80 | echo "Changes detected:" 81 | git status --porcelain 82 | 83 | git add . 84 | git commit -m "sync: Update MDX content from Whitepaper" 85 | if ! git push -f origin "$BRANCH_NAME"; then 86 | echo "Failed to push changes" 87 | exit 1 88 | fi 89 | 90 | # Create the PR using GitHub CLI. 91 | gh pr create \ 92 | --title "sync: Update MDX content from Whitepaper" \ 93 | --body-file ../source/sync-pr-template.txt \ 94 | --base main \ 95 | --head "$BRANCH_NAME" \ 96 | --label "sync" \ 97 | --assignee ${{ github.actor }} 98 | else 99 | echo "No changes detected in git status --porcelain" 100 | echo "Full directory contents of src/content/pages:" 101 | ls -la src/content/pages/ 102 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | sync 4 | 5 | # Python 6 | .venv 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | .Python 11 | pip-log.txt 12 | pip-delete-this-directory.txt 13 | .pytest_cache/ 14 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Apify Technologies s.r.o. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /img/apify-actor-drawing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/apify-actor-drawing.png -------------------------------------------------------------------------------- /img/apify-store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/apify-store.png -------------------------------------------------------------------------------- /img/screenshot-taker-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/actor-whitepaper/442c057fbc734e173178f81301fd4096876dda0b/img/screenshot-taker-input.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "actor-specs", 3 | "version": "0.0.1", 4 | "description": "This is just to generate the table of contents in Markdown files", 5 | "devDependencies": { 6 | "markdown-link-check": "^3.13.6", 7 | "markdown-toc": "^1.2.0", 8 | "prettier": "^3.5.1", 9 | "prettier-plugin-astro": "^0.14.1", 10 | "prettier-plugin-astro-organize-imports": "^0.4.11", 11 | "prettier-plugin-css-order": "^2.1.2", 12 | "prettier-plugin-jsdoc": "^1.3.2", 13 | "prettier-plugin-organize-attributes": "^1.0.0", 14 | "prettier-plugin-organize-imports": "^4.1.0", 15 | "prettier-plugin-tailwindcss": "^0.6.11" 16 | }, 17 | "scripts": { 18 | "build-toc": "./node_modules/.bin/markdown-toc README.md -i --maxdepth 3 && ./node_modules/.bin/markdown-link-check README.md", 19 | "test-sync": "chmod +x scripts/*.sh && ./scripts/test-sync.sh", 20 | "format-sync": "prettier --write --config sync/target/.prettierrc.cjs --ignore-path false --plugin=prettier-plugin-astro --plugin=prettier-plugin-organize-imports --plugin=prettier-plugin-organize-attributes --plugin=prettier-plugin-astro-organize-imports --plugin=prettier-plugin-css-order --plugin=prettier-plugin-tailwindcss --plugin=prettier-plugin-jsdoc \"./sync/target/src/content/pages/**/*.mdx\"" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "https://github.com/apifytech/actor-scraper" 25 | }, 26 | "author": { 27 | "name": "Jan Curn", 28 | "email": "jan@apify.com", 29 | "url": "https://apify.com/jancurn" 30 | }, 31 | "license": "Apache-2.0" 32 | } 33 | -------------------------------------------------------------------------------- /pages/ACTOR_FILE.md: -------------------------------------------------------------------------------- 1 | # Actor file specification 2 | 3 | This JSON file must be present at `.actor/actor.json` and defines core properties of a single web Actor. 4 | 5 | The file contains a single JSON object with the following properties: 6 | 7 | ```jsonc 8 | { 9 | // Required field, indicates that this is an Actor definition file and the specific version of the Actor specification. 10 | "actorSpecification": 1, 11 | 12 | // Required "technical" name of the Actor, must be a DNS hostname-friendly text. 13 | "name": "google-search-scraper", 14 | 15 | // Human-friendly name and description of the Actor. 16 | "title": "Google Search Scraper", 17 | "description": "A 200-char description", 18 | 19 | // Required, indicates the version of the Actor. 
Since the actor.json file is committed to Git, you can have different Actor 20 | // versions in different branches. 21 | "version": "0.0", 22 | 23 | // Optional tag that is applied to the builds of this Actor. If omitted, it defaults to "latest". 24 | "buildTag": "latest", 25 | 26 | // An optional object with environment variables expected by the Actor. 27 | // Secret values are prefixed by @ and their actual values need to be registered with the CLI, for example: 28 | // $ apify secrets add mySecretPassword pwd1234 29 | "environmentVariables": { 30 | "MYSQL_USER": "my_username", 31 | "MYSQL_PASSWORD": "@mySecretPassword" 32 | }, 33 | 34 | // Optional field. If true, the Actor indicates it can be run in Standby mode, 35 | // to get started and be kept alive by the system to handle incoming HTTP REST requests by the Actor's web server. 36 | "usesStandbyMode": true, 37 | 38 | // An optional metadata object enabling implementations to pass arbitrary additional properties. 39 | // The properties and their values must be strings. 40 | "labels": { 41 | "something": "bla bla" 42 | }, 43 | 44 | // Optional minimum and maximum memory for running the Actor. 45 | "minMemoryMbytes": 128, 46 | "maxMemoryMbytes": 4096, 47 | 48 | // Optional link to the Actor Dockerfile. 49 | // If omitted, the system looks for "./Dockerfile" or "../Dockerfile" 50 | "dockerfile": "./Dockerfile", 51 | 52 | // Optional link to the Actor README file in Markdown format. 53 | // If omitted, the system looks for "./ACTOR.md" and "../README.md" 54 | "readme": "./README.md", 55 | 56 | // Optional link to the Actor changelog file in Markdown format. 57 | "changelog": "../../../shared/CHANGELOG.md", 58 | 59 | // Optional link to Actor input or output schema file, or inlined schema object, 60 | // which is a JSON schema with our extensions. For details see ./INPUT_SCHEMA.md or ./OUTPUT_SCHEMA.md, respectively. 61 | // BACKWARDS COMPATIBILITY: "inputSchema" used to be called "input", all implementations should support this. 62 | "inputSchema": "./input_schema.json", 63 | "outputSchema": "./output_schema.json", 64 | 65 | // Optional path to Dataset or Key-value Store schema file or inlined schema object for the Actor's default dataset or key-value store. 66 | // For details, see ./DATASET_SCHEMA.md or ./KEY_VALUE_STORE_SCHEMA.md, respectively. 67 | // BACKWARDS COMPATIBILITY: "datasetSchema" used to be the "storages.dataset" sub-object, all implementations should support this. 68 | "datasetSchema": "../shared_schemas/generic_dataset_schema.json", 69 | "keyValueStoreSchema": "./key_value_store_schema.json", 70 | 71 | // Optional path or inlined schema object of the Actor's web server in OpenAPI format. 72 | "webServerSchema": "./web_server_openapi.json", 73 | 74 | // Optional URL path and query parameters to the Model Context Protocol (MCP) server exposed by the Actor web server. 75 | // If present, the system knows the Actor provides an MCP server, which can be used by the platform 76 | // and integrations to integrate the Actor with various AI/LLM systems. 77 | "webServerMcpPath": "/mcp?version=2", 78 | 79 | // Scripts can be used by tools like the CLI to do certain actions based on the commands you run. 80 | // The presence of this object in your Actor config is optional, but we recommend always defining at least the `run` key. 81 | "scripts": { 82 | // The `run` script is special - it defines *the* way to run your Actor locally. 
While tools can decide 83 | // to implement mechanisms to detect what type of project your Actor is, and how to run it, you can choose to 84 | // define this as the source of truth. 85 | // 86 | // This should be the same command you run as if you were at the root of your Actor when you start it locally. 87 | // This can be anything from an npm script, as shown below, to a full chain of commands (e.g., `cargo test && cargo run --release`). 88 | // 89 | // CLIs may opt to also request this command when initializing a new Actor, or to automatically migrate and add it the first time 90 | // you start the Actor locally. 91 | "run": "npm start" 92 | } 93 | } 94 | ``` 95 | 96 | ## Notes 97 | 98 | - The `name` doesn't contain the developer username, so that the Actor can be easily deployed 99 | to any user account. This is useful for tutorials and examples, as well as 100 | pull requests done externally to create Actors from existing source code files 101 | owned by external developers 102 | (the developer might not have an Apify account yet, and we might want to show them deployment 103 | to some testing account). 104 | Note that `apify push` has an option `--target=eva/my-actor:0.0` that allows 105 | deployment of the Actor under a different user account, using the permissions 106 | and personal API token of the current user. 107 | We should also add options to override only parts of this, 108 | like `--target-user` (ID or username), `--name`, `--build-tag` and `--version`; 109 | this would be useful e.g. in CI for beta versions. 110 | - Note that `version` and `buildTag` are shared across Actor deployments to 111 | all user accounts, similarly to software libraries, 112 | and hence they are part of `actor.json`. 113 | - The `dockerfile` property points to a Dockerfile that is to be used to build the 114 | Actor image. If not present, the system looks for a Dockerfile in the `.actor` directory 115 | and, if not found, then in the Actor's top-level 116 | directory. This setting is useful if the source code repository has some 117 | other Dockerfile in the top-level directory, to separate the Actor Docker image from the 118 | other one. Note that paths in a Dockerfile are ALWAYS relative to the Dockerfile's location. 119 | When calling `apify run`, the system runs the Actor using the Dockerfile. 120 | - When calling `apify push` and the `title` or `description` are already set 121 | on the Actor (maybe SEO-optimized versions from a copywriter), 122 | by default we do not overwrite them 123 | unless `apify push` is called with the options `--force-title` or `--force-description`. 124 | 125 | ## Changes from the legacy `apify.json` file 126 | 127 | The `.actor/actor.json` replaces the legacy `apify.json` file. Here are the main changes from the previous version: 128 | 129 | - We removed the `template` property as it's not needed for anything; it only stored the original template. 130 | - There's a new `title` field for a human-readable name of the Actor. 131 | We're moving towards having human-readable names shown for Actors everywhere, 132 | so it makes sense to define `title` directly in the source code. 133 | - Similarly, we added `description` for the short description of what the Actor does. 134 | - `env` was renamed to `environmentVariables` for more clarity. `apify build` or `apify run` 135 | could have an option `--apply-env-vars-to-build` like we have on the platform. 
136 | - The `dockerfile` and `readme` directives are optional; the system falls back to reasonable 137 | defaults, first in the `.actor` directory and then in the top-level directory. 138 | - The `scripts` section was added 139 | -------------------------------------------------------------------------------- /pages/DATASET_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Dataset schema file specification 1.0 2 | 3 | Dataset storage enables you to sequentially store and retrieve data records in various formats. 4 | Each Actor run is assigned its own dataset, which is created when the first item is stored to it. 5 | Datasets usually contain results from web scraping, crawling, or data processing jobs. 6 | The data can be visualized as a table where each object is a row and its attributes are the columns. 7 | The data can be exported in JSON, CSV, XML, RSS, Excel, or HTML formats. 8 | 9 | The specification is also available at https://docs.apify.com/platform/actors/development/actor-definition/output-schema 10 | 11 | A dataset can be assigned a schema which describes: 12 | 13 | - Content of the dataset, i.e., the schema of objects that are allowed to be added 14 | - Different views on how we can look at the data, aka transformations 15 | - Visualization of the View using predefined components (grid, table, ...), which improves the run view interface in Apify Console 16 | and also provides a better interface for datasets shared by Apify users 17 | 18 | 19 | 20 | 21 | 22 | ## Basic properties 23 | 24 | - Storage is immutable. I.e., if you want to change the structure, then you need to create a new dataset. 25 | - Its schema is weak. I.e., you can always push additional properties, but the schema will ensure that all the listed ones are there with the correct type. This is to make Actors more compatible, i.e., some Actor might expect the dataset to contain certain fields but not care about the additional ones. 26 | 27 | There are two ways to create a dataset with a schema: 28 | 1. The user can start an Actor that has a dataset schema linked from its 29 | [OUTPUT_SCHEMA.json](./OUTPUT_SCHEMA.md) 30 | 2. Or the user can do it programmatically via the API (for an empty dataset), 31 | - either by passing the schema as a payload to the [create dataset](https://docs.apify.com/api#/reference/datasets/dataset-collection/create-dataset) API endpoint, 32 | - or by using the SDK: 33 | 34 | ```js 35 | const dataset = await Apify.openDataset('my-new-dataset', { schema }); 36 | ``` 37 | 38 | By opening an **existing** dataset with the `schema` parameter, the system ensures that you are opening a dataset that is compatible with the Actor; otherwise, you get an error: 39 | 40 | ``` 41 | Uncaught Error: Dataset schema is not compatible with the provided schema 42 | ``` 43 | 44 | ## Structure 45 | 46 | ```jsonc 47 | { 48 | "actorDatasetSchemaVersion": 1, 49 | "title": "E-shop products", 50 | "description": "Dataset containing the whole product catalog including prices and stock availability.", 51 | 52 | // A JSON schema object describing the dataset fields, with our extensions: the "title", "description", and "example" properties. 53 | // "example" is used to generate code and API examples for the Actor output. 
54 | // For details, see https://docs.apify.com/platform/actors/development/actor-definition/dataset-schema 55 | "fields": { 56 | "type": "object", 57 | "properties": { 58 | "title": { 59 | "type": "string", 60 | "description": "The name of the product", 61 | }, 62 | "imageUrl": { 63 | "type": "string", 64 | "description": "URL of the product image", 65 | }, 66 | "priceUsd": { 67 | "type": "integer", 68 | "description": "Price of the item", 69 | }, 70 | "manufacturer": { 71 | "type": "object", 72 | "properties": { 73 | "title": { ... }, 74 | "url": { ... }, 75 | } 76 | }, 77 | ... 78 | }, 79 | "required": ["title"], 80 | }, 81 | 82 | // Defines the ways to present the Dataset to users 83 | "views": { 84 | "overview": { 85 | "title": "Products overview", 86 | "description": "Displays only basic fields such as title and price", 87 | "transformation": { 88 | "flatten": ["stockInfo"], 89 | "fields": [ 90 | "title", 91 | "imageUrl", 92 | "stockInfo.availability" 93 | ] 94 | }, 95 | "display": { 96 | "component": "table", 97 | "properties": { 98 | "title": { 99 | "label": "Title" 100 | }, 101 | "imageUrl": { 102 | "label": "Image", 103 | "format": "image" // Optional; here the format is overridden to show an "image" instead of an image link ("text"). The "image" format only works with .jpeg, .png, or other image URLs. 104 | }, 105 | "stockInfo.availability": { 106 | "label": "Availability" 107 | } 108 | } 109 | } 110 | }, 111 | "productVariants": { 112 | "title": "Product variants", 113 | "description": "Each product expanded into one item per variant", 114 | "transformation": { 115 | "fields": [ 116 | "title", 117 | "price", 118 | "productVariants" 119 | ], 120 | "unwind": "productVariants" 121 | }, 122 | "display": { 123 | // Simply renders all the available fields. 124 | // This component is used by default when no display is specified. 125 | "component": "table" 126 | } 127 | } 128 | }, 129 | } 130 | ``` 131 | 132 | ## DatasetSchema object definition 133 | 134 | | Property | Type | Required | Description | 135 | | ------------------ | ---------------------------- | -------- | -------------------------------------------------------------------------------------------------- | 136 | | actorDatasetSchemaVersion | integer | true | Specifies the version of the dataset schema
structure document.
Currently only version 1 is available. | 137 | fields | JSON schema | true | A JSON schema object describing the dataset fields; more formats may be added in the future. | 138 | views | [DatasetView] | true | An object with the definitions of the API
and UI views. | 139 | 140 | ### JSON schema 141 | 142 | Items of a dataset can be described by a JSON schema definition, passed into the `fields` property. 143 | The Actor system then ensures that each record added to the dataset complies with the provided schema. 144 | 145 | ```jsonc 146 | { 147 | "type": "object", 148 | "required": [ 149 | "name", 150 | "email" 151 | ], 152 | "properties": { 153 | "id": { 154 | "type": "string" 155 | }, 156 | "name": { 157 | "type": "string" 158 | }, 159 | "email": { 160 | "type": "string" 161 | }, 162 | "arr": { 163 | "type": "array", 164 | "items": { 165 | "type": "object", 166 | "required": [], 167 | "properties": { 168 | "site": { 169 | "type": "string" 170 | }, 171 | "url": { 172 | "type": "string" 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | ``` 180 | 181 | 182 | ### DatasetView object definition 183 | 184 | | Property | Type | Required | Description | 185 | | -------------- | ------------------------- | -------- | ----------------------------------------------------------------------------------------------------- | 186 | | title | string | true | The title is visible in the UI in the Output tab
as well as in the API. | 187 | | description | string | false | The description is only available in the API response.
The usage of this field is optional. | 188 | transformation | ViewTransformation object | true | The definition of the data transformation applied when dataset data are loaded from the Dataset API. | 189 | display | ViewDisplay object | true | The definition of the Output tab UI visualization. | 190 | 191 | ### ViewTransformation object definition 192 | 193 | | Property | Type | Required | Description | 194 | | -------- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 195 | | fields | string[] | true | Selects fields that are going to be presented in the output.
The order of fields matches the order of columns
in the visualization UI. In case a field's value is missing, it will be presented as "undefined" in the UI. | 196 | unwind | string | false | Deconstructs nested children into the parent object, e.g., with unwind: ["foo"], the object `{"foo":{"bar":"hello"}}` is turned into `{"bar":"hello"}`. | 197 | flatten | string[] | false | Transforms a nested object into a flat structure, e.g., with flatten: ["foo"], the object `{"foo":{"bar":"hello"}}` is turned into `{"foo.bar":"hello"}`. | 198 | omit | string[] | false | Removes the specified fields from the output.
Nested field names can be used there as well. | 199 | limit | integer | false | The maximum number of results returned.
Default is all results. | 200 | desc | boolean | false | By default, results are sorted in ascending order based on the write event into the dataset. The desc: true param will return the newest writes to the dataset first. | 201 | 202 | ### ViewDisplay object definition 203 | 204 | | Property | Type | Required | Description | 205 | | ---------- | ------------------------------------------------------------------------------------------------------------------ | -------- | ---------------------------------------------------------------------------------------------------------------------------- | 206 | | component | string | true | Only the "table" component is available. | 207 | properties | Object | false | Object with keys matching the `transformation.fields`
and ViewDisplayProperty as values. In case properties are not set, the table will be rendered automatically with fields formatted as Strings, Arrays, or Objects. | 208 | 209 | ### ViewDisplayProperty object definition 210 | 211 | | Property | Type | Required | Description | 212 | | -------- | ------------------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------- | 213 | | label | string | false | Used when the data are visualized in Table view. The label will be visible in the table column's header. | 214 | format | enum(text, number, date, link,
boolean, image, array, object) | false | Describes how output data values are formatted
in order to be rendered in the output tab UI. | 215 | -------------------------------------------------------------------------------- /pages/IDEAS.md: -------------------------------------------------------------------------------- 1 | 2 | # Sandbox for various ideas 3 | 4 | Here you can find random ideas and notes, in no particular order, with no guarantee of relevance or that they will be implemented. 5 | 6 | ## TODOs 7 | 8 | 9 | - Add ideas for the permission system 10 | - Note from Marek regarding permissions: 11 | - Just a note on this, I was thinking about how this could be done systematically, so dropping the notes here: 12 | - By default, the Actor should have the following permissions that the user would accept when running the Actor for the first time: 13 | - Write to all the default + named storages linked in the output schema 14 | - Proxy - simply because we want all the traffic to run through the proxy, so we don't want Actors scraping directly 15 | - In `actor.json` the Actor could request additional permissions, basically anything from [permissions](https://docs.apify.com/access-rights/list-of-permissions#actor-task), for example, `DATASET.READ` to be able to read all the datasets or `SCHEDULER.WRITE` to manage schedules. 16 | There is one tricky part: 17 | - If an Actor needs to `.call()` other Actors then basically the user must give it full permissions. Otherwise, the Actor would have to list all the other Actors it's going to call, and the user would have to accept all the permissions needed in recursive calls. 18 | Extra question: 19 | - What to do if the new version of the Actor requires more permissions? We should probably require the author to increase the major version and keep users on the old build + email them to accept the updated permissions. 20 | 21 | - We should make env vars independent of Apify, i.e. start them with `ACTOR_`, rather than `APIFY_` 22 | 23 | - To storages, add info about atomic rename, e.g. the `setName` function, and link to other operations... 24 | 25 | - Maybe add an `Actor.getThisRun()` function to return the run object of the current Actor. Not sure about the use case... 26 | 27 | - Figure out the push/build workflow, see https://github.com/apify/actor-specs/pull/7/files#r997020215 28 | / https://github.com/apify/actor-specs/pull/7#pullrequestreview-1144097598 29 | how should that work with 30 | 31 | - Would be nice to have an API that would send a message to a run, and the run would get it as `.on('message', (msg) => { ... })`. Would save people from implementing their own servers in Actors. 32 | It would make it easier to orchestrate Actors. Currently it's a bit painful to create a "master" Actor and then "workers" to process some workloads. But it could probably be achieved with a queue, if it were distributed and generic. 33 | Explain why this is better than the live-view HTTP API 34 | 35 | 36 | - NOTE: BTW, we have a new API v3 doc with ideas for changes in the API: https://www.notion.so/apify/API-v3-6fcd240d9621427f9650b741ec6fa06b 37 | 38 | - For the DATASET schema: in future versions, let's consider referencing the schema using a URL; for now, let's keep it simple 39 | 40 | 41 | 42 | ### Pipe result of an Actor to another (aka chaining) 43 | 44 | An Actor can start other Actors and 45 | pass them its own dataset or key-value store. 46 | For example, the main Actor can produce files 47 | and the spawned Actors can consume them from the same storages. 48 | 49 | In the future, we could let datasets be cleaned up from the beginning, 50 | effectively creating a pipe, with a custom rolling window. 
51 | Webhooks can be attached to storage operations, 52 | and so launch other Actors to consume newly added items or files. 53 | 54 | #### UNIX equivalent 55 | 56 | ```bash 57 | $ ls -l | grep "something" | wc -l 58 | ``` 59 | 60 | **TODO (@jancurn):** We could have special CLI support for creating Actor chains using the pipe operator, 61 | like this: 62 | 63 | ``` 64 | $ apify call apify/google-search-scraper | apify call apify/send-email queryTerms="aaa\nbbb" 65 | ``` 66 | 67 | Note from Marek: 68 | Here we will need some way to map outputs from the first Actor to inputs of the following Actor; perhaps we could pipe them through some utility like [jq](https://stedolan.github.io/jq/tutorial/) 69 | or use some mapping like: 70 | 71 | ``` 72 | --input-dataset-id="$output.defaultDatasetId" --dataset-name="xxx" 73 | ``` 74 | 75 | Note from Ondra: 76 | I tried to write a JS example for piping, but figured that piping is not really aligned with how Actors work, because piping assumes the output of one program is immediately processed by another program. Actors can produce output like this, but they can't process input like this. Input is provided only once, when the Actor starts. Unless we consider e.g. the request queue as input. We will have to think about this a bit differently. 77 | 78 | Note from Jan: 79 | Indeed, the flow is to start one Actor, and pass one of its storages as the default to the other newly started Actor. If we had a generic Queue, it could be used nicely for this use case. I'm adding these notes to the doc, so that we can get back to them later. 80 | 81 | Jan: I'd get rid of the Request queue from the Actor specification, and keep it as Apify's extension only. 82 | 83 | 84 | -------------------------------------------------------------------------------- /pages/INPUT_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Actor input schema file specification 1.0 2 | 3 | This JSON file defines the schema and description of the input object accepted by the 4 | Actor (see [Input](../README.md#input) for details). 5 | The file is referenced from the main [Actor file (.actor/actor.json)](ACTOR_FILE.md) using the `inputSchema` directive, 6 | and it is typically stored in `.actor/input_schema.json`. 7 | 8 | The file is a JSON schema with our extensions describing a single Actor input object 9 | and its properties, including documentation, default value, and user interface definition. 10 | 11 | **For full reference, see [Input schema specification](https://docs.apify.com/platform/actors/development/actor-definition/input-schema/specification/v1) in Apify documentation.** 12 | 13 | 14 | 15 | 16 | ## Example Actor input schema 17 | 18 | ```jsonc 19 | { 20 | "actorInputSchemaVersion": 1, 21 | 22 | "title": "Input schema for an Actor", 23 | "description": "Enter the start URL(s) of the website(s) to crawl, configure other optional settings, and run the Actor to crawl the pages and extract their text content.", 24 | "type": "object", 25 | 26 | "properties": { 27 | 28 | "startUrls": { 29 | "title": "Start URLs", 30 | "type": "array", 31 | "description": "One or more URLs of the pages where the crawler will start. Note that the Actor will additionally only crawl sub-pages of these URLs. 
For example, for the start URL `https://www.example.com/blog`, it will crawl pages like `https://example.com/blog/article-1`, but will skip `https://example.com/docs/something-else`.", 32 | "editor": "requestListSources", 33 | "prefill": [{ "url": "https://docs.apify.com/" }] 34 | }, 35 | 36 | // The input value is another Dataset. The system can generate a UI to make it easy to select the dataset. 37 | "processDatasetId": { 38 | "title": "Input dataset", 39 | "type": "string", 40 | "resourceType": "dataset", 41 | "description": "Dataset to be processed by the Actor", 42 | // Optional link to dataset schema, used by the system to validate the input dataset 43 | "schema": "./input_dataset_schema.json" 44 | }, 45 | 46 | "screenshotsKeyValueStoreId": { 47 | "title": "Screenshots to process", 48 | "type": "string", 49 | "resourceType": "keyValueStore", 50 | "description": "Screenshots to be compressed", 51 | "schema": "./input_key_value_store_schema.json" 52 | }, 53 | 54 | "singleFileUrl": { 55 | "title": "Some file", 56 | "type": "string", 57 | "editor": "fileupload", 58 | "description": "File to be processed", 59 | "schema": "./input_key_value_store_schema.json" 60 | }, 61 | 62 | "crawlerType": { 63 | "sectionCaption": "Crawler settings", 64 | "title": "Crawler type", 65 | "type": "string", 66 | "enum": ["playwright:chrome", "cheerio", "jsdom"], 67 | "enumTitles": ["Headless web browser (Chrome+Playwright)", "Raw HTTP client (Cheerio)", "Raw HTTP client with JS execution (JSDOM) (experimental!)"], 68 | "description": "Select the crawling engine:\n- **Headless web browser** (default) - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.", 69 | "default": "playwright:chrome" 70 | }, 71 | 72 | "maxCrawlDepth": { 73 | "title": "Max crawling depth", 74 | "type": "integer", 75 | "description": "The maximum number of links starting from the start URL that the crawler will recursively descend. The start URLs have a depth of 0, the pages linked directly from the start URLs have a depth of 1, and so on.\n\nThis setting is useful to prevent accidental crawler runaway. By setting it to 0, the Actor will only crawl the start URLs.", 76 | "minimum": 0, 77 | "default": 20 78 | }, 79 | 80 | "maxCrawlPages": { 81 | "title": "Max pages", 82 | "type": "integer", 83 | "description": "The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.", 84 | "minimum": 0, 85 | "default": 9999999 86 | } 87 | 88 | } 89 | } 90 | ``` 91 | 92 | ## Random notes 93 | 94 | 95 | We could also add an `actor` resource type. The use case could be, for example, a testing Actor with three inputs: 96 | - the Actor to be tested 97 | - a test function containing, for example, a Jest unit test over the output 98 | - an input for the Actor 99 | 100 | ...and the testing Actor would call the given Actor with the given input and, in the end, execute the tests to check whether the results are correct. 
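For illustration, here is a minimal sketch of how an Actor might read the input object described by a schema like the one above, using the Apify JS SDK. The `startUrls` and `maxCrawlPages` fields come from the example schema; the local fallback default simply mirrors it:

```js
import { Actor } from 'apify';

await Actor.init();

// The input object has been validated against the Actor's input schema,
// so the fields below arrive with the declared types.
const input = await Actor.getInput();
const { startUrls, maxCrawlPages = 9999999 } = input;

console.log(`Crawling ${startUrls.length} start URL(s), up to ${maxCrawlPages} pages.`);

await Actor.exit();
```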
101 | 102 | -------------------------------------------------------------------------------- /pages/KEY_VALUE_STORE_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Key-value store schema file specification [work in progress] 2 | 3 | This JSON file should contain a schema for files stored in the key-value store, 4 | defining their name, format, or content type. 5 | 6 | **BEWARE: This is not implemented yet and is subject to change.** 7 | 8 | ## Basic properties 9 | 10 | Key-value store schema has three main use cases, described in the following examples: 11 | 12 | 1. Some Actors such as [Instagram scraper](https://apify.com/jaroslavhejlek/instagram-scraper) 13 | store multiple types of files in the key-value store. Let's say the scraper stores post images and user pictures. 14 | So for each of these, we would define a prefix group called a collection and allow the user to list files from a single collection in both the 15 | UI and the API. 16 | 17 | ```jsonc 18 | { 19 | "collections": { 20 | "screenshots": { 21 | "name": "Post images", 22 | "keyPrefix": "images-", 23 | "contentTypes": ["image/jpeg", "image/png"] 24 | } 25 | } 26 | } 27 | ``` 28 | 29 | 2. Some Actors store a specific record, and we want to ensure its content type is HTML and embed it into the run view. 30 | A good example is the [monitoring](https://apify.com/apify/monitoring#check-frequency) Actor, which generates an HTML report that we would 31 | like to embed into the run view for the user once the monitoring is finished. 32 | 33 | ```jsonc 34 | { 35 | "collections": { 36 | "monitoringReport": { 37 | "name": "Monitoring report", 38 | "description": "HTML page containing monitoring results", 39 | "key": "REPORT", 40 | "contentTypes": ["text/html"] 41 | } 42 | } 43 | } 44 | ``` 45 | 46 | 3. Some Actors store a record that has a specific structure. The structure can be specified using [JSON schema](https://json-schema.org/draft-07). 47 | Contrary to the dataset schema, a record in the key-value store represents output that is a single item instead of a sequence of items. But both approaches use JSON schema to describe the structure. 48 | 49 | ```jsonc 50 | { 51 | "collections": { 52 | "monitoringReportData": { 53 | "name": "Monitoring report data", 54 | "description": "JSON containing the report data", 55 | "key": "report-data.json", 56 | "contentTypes": ["application/json"], 57 | "jsonSchema": { 58 | "$schema": "http://json-schema.org/draft-07/schema#", 59 | "type": "object", 60 | "properties": { 61 | "summary": { "type": "string" }, 62 | "totalResults": { "type": "number" } 63 | } 64 | } // alternatively "jsonSchema": "./report-schema.json" can be used 65 | } 66 | } 67 | } 68 | ``` 69 | 70 | ## Structure 71 | 72 | ```jsonc 73 | { 74 | "actorKeyValueStoreSchemaVersion": 1, 75 | "name": "My Instagram backup", 76 | "description": "Backup of my Instagram account", 77 | 78 | "collections": { 79 | "postImages": { 80 | "name": "Post images", 81 | "description": "Contains all Instagram post images", 82 | "keyPrefix": "post-image-", 83 | "contentTypes": ["image/jpeg", "image/png"] 84 | }, 85 | 86 | "profilePicture": { 87 | "name": "Profile picture", 88 | "key": "profile-picture", 89 | "contentTypes": ["image/*"] // Wildcards can be used to allow all image or text types, etc. 
90 | } 91 | } 92 | } 93 | ``` 94 | 95 | ## API implications 96 | 97 | Enable the user to list keys for a specific collection: 98 | 99 | ``` 100 | https://api.apify.com/v2/key-value-stores/storeId/keys?collection=postImages&exclusiveStartKey=xxx 101 | ``` 102 | 103 | In addition to this, the user will be able to list by prefix directly: 104 | 105 | ``` 106 | https://api.apify.com/v2/key-value-stores/storeId/keys?prefix=post-images- 107 | ``` 108 | -------------------------------------------------------------------------------- /pages/OUTPUT_SCHEMA.md: -------------------------------------------------------------------------------- 1 | # Actor output schema file specification 1.0 [work in progress] 2 | 3 | This JSON file defines the schema of the [output](../README.md#output) object produced by a web Actor. 4 | The file is referenced from the main [Actor file](./ACTOR_FILE.md) using the `outputSchema` property, 5 | and it is typically stored in `.actor/output_schema.json`. 6 | 7 | The format is a JSON Schema with our extensions, describing a single object. 8 | 9 | The output schema is used by the system to generate the 10 | output JSON object, 11 | whose fields correspond to `properties` and whose values are URLs linking to the actual Actor results in a dataset, key-value store files, or a live view web server. 12 | This output object is generated by the system right when the Actor starts, without executing any of the Actor's code, 13 | and remains static over the entire lifecycle of the Actor; only the linked content changes over time as the Actor produces results. 14 | This is necessary to enable integration of results into other systems, as you don't need to run the Actor 15 | to see the format of its results; it's predefined by the output schema. 16 | 17 | The output schema is also used by the system to generate the user interface, API examples, integrations, etc. 18 | 19 | ## Structure 20 | 21 | ```jsonc 22 | { 23 | "actorOutputSchemaVersion": 1, 24 | 25 | "title": "Some title", 26 | "description": "This text is shown in the Output UI", 27 | "type": "object", 28 | 29 | "properties": { 30 | 31 | // This property in the output object will contain a URL to the dataset containing Actor results, 32 | // for example: https://api.apify.com/v2/datasets/XYZabc/items?format=json&view=product_details 33 | "currentProductsDatasetUrl": { 34 | // Type is string, because the value in the output object is a URL 35 | "type": "string", 36 | "title": "Current products", 37 | "description": "Yaddada", 38 | 39 | // Identifies what kind of object is referenced by this output property (same syntax as "resourceType" in the input schema). 40 | // If used, the system will interpret the "source" and render the dataset in the UI in a special way. 41 | "resourceType": "dataset", 42 | 43 | // Defines how the output value is created, using a text format where {{x}} denotes variables (same syntax as webhook templates) 44 | "template": "{{actorRun.defaultDatasetUrl}}?format=json&view=product_details", 45 | 46 | // Or reference a property from the input object; the linkage will be checked for type compatibility 47 | // "template": "{{actorInput.myProductsDatasetId}}" 48 | }, 49 | 50 | // Selects a specific group of records with a certain prefix. In the UI, this can be shown 51 | // as a list of images. In the output object, this will be a link to an API with a "prefix" param. 
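// As an illustration (an assumed resolved value, following the collections API shown in KEY_VALUE_STORE_SCHEMA.md, with a made-up store ID): // https://api.apify.com/v2/key-value-stores/XYZabc/keys?collection=screenshots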
52 | "productImagesUrl": { 53 | "type": "string", 54 | "title": "Product screenshots", 55 | 56 | "resourceType": "keyValueStore", 57 | 58 | // Define how the URL is created, in this case it will link to the default Actor key-value store 59 | "template": "{{actorRun.defaultKeyValueStoreUrl}}?collection=screenshots" 60 | }, 61 | 62 | // Example of reference to a file stored in Actor's default key-value store. 63 | // In UI can be rendered as a file download. 64 | "mainScreenshotFileUrl": { 65 | "type": "string", 66 | "title": "Main screenshot", 67 | "description": "URL to an image with main product screenshot.", 68 | "template": "{{actorRun.defaultKeyValueStoreUrl}}/screenshot.png", 69 | }, 70 | 71 | // Live view web server for to the Actor 72 | // In the "output" view, this page is rendered in an IFRAME 73 | "productExplorerWebUrl": { 74 | "type": "string", 75 | "resourceType": "webServer", 76 | "title": "Live product explorer app", 77 | "description": "API documentation is available in swagger.com/api/xxxx", // optional 78 | 79 | // TODO: ideally this should be named {{actorRun.webServerUrl}} for consistency, but we'd need to change ActorRun everywhere 80 | "template": "{{actorRun.containerUrl}}/product-explorer/", 81 | } 82 | } 83 | } 84 | ``` 85 | 86 | 87 | ## Random notes 88 | 89 | The output schema can reference other datasets/kv-stores/queues 90 | but only those ones that are referenced in the input, or the default. Hence 91 | there's no point to include storage schema here again, as it's done elsewhere. 92 | 93 | - **NOTE:** The output schema should enable developers to define schema for the 94 | default dataset and key-value store. But how? It should be declarative 95 | so that the system can check that e.g. the overridden default dataset 96 | has the right schema. But then, when it comes to kv-store, that's not purely 97 | output object but INPUT, similarly for overridden dataset or request queue. 98 | Perhaps the cleanest way would be to set these directly in `.actor/actor.json`. 99 | - The Run Sync API could have an option to automatically return (or redirect to?) 100 | a specific property (i.e. URL) of the output object. 101 | This would supersede the `outputRecordKey=OUTPUT` API param as well as 102 | the run-sync-get-dataset-items API endpoint. 103 | Maybe we could have one of the output properties as the main one, 104 | which would be used by default for this kind of API endpoint, and just return 105 | data to user. 106 | - Same as we show Output in UI, we need to autogenerate the OUTPUT in API e.g. JSON format. 107 | There would be properties like in the output_schema.json file, with e.g. URL to dataset, 108 | log file, kv-store, live view etc. So it would be an auto-generated field "output" 109 | that we can add to JSON returned by the Run API endpoints 110 | (e.g. https://docs.apify.com/api/v2#/reference/actor-tasks/run-collection/run-task) 111 | - Also see: https://github.com/apify/actor-specs/pull/5#discussion_r775641112 112 | - `output` will be a property of run object generated from Output schema 113 | 114 | 115 | 116 | ## Examples of ideal Actor run UI 117 | 118 | - For the majority of Actors, we want to see the dataset with new records being added in realtime 119 | - For [Google Spreadsheet Import](https://apify.com/lukaskrivka/google-sheets), 120 | we want to first display Live View for the user to set up OAUTH, and once 121 | this is set up, then we want to display the log next time. 
122 | - For technical Actors, it might be a log
123 | - For [HTML to PDF converter](https://apify.com/jancurn/url-to-pdf), it's a single record from the key-value store
124 | - For [Monitoring](https://apify.com/apify/monitoring-runner), it's the log during the runtime and a single HTML record in an iframe at the end
125 | - For an Actor that has failed, it might be the log
126 | 
127 | ## How to define Actor run UI
128 | 
129 | ### Simple version
130 | 
131 | There will be a new tab called "Output" on the Actor run detail, for every Actor with an output schema.
132 | This tab will be at the first position and displayed by default. The tab will show the following:
133 | - Items from the output schema with the property `visible: true` will be rendered in the same order
134 |   as they appear in the schema
135 | - The live view will be displayed only when it has `visible: true` and when it's active.
136 |   Otherwise, we should show just a short message "This show is over".
137 | - If the dataset has more views, then we should show a select or tabs to switch between the views
138 | 
139 | ### Ideal, most comprehensive state
140 | 
141 | - Default setup, i.e., what output components should be displayed on the default run tab
142 | - Optionally, the setup for different states
143 | - The ability to programmatically change this via the API, from the Actor itself
144 | 
--------------------------------------------------------------------------------
/pages/REQUEST_QUEUE_SCHEMA.md:
--------------------------------------------------------------------------------
1 | # Request queue schema file specification [work in progress]
2 | 
3 | Currently, this is neither specified nor implemented.
4 | We think that a request queue schema might be useful for two things:
5 | 
6 | - ensuring which kinds of URLs can be enqueued (certain domains or subdomains, ...)
7 | - ensuring that, for example, each request has `userData.label`, i.e. enforcing a schema of `userData` the same way we enforce it for datasets
8 | 
9 | We should consider renaming `RequestQueue` to just `Queue` and making it more generic, and then it makes sense to have a request schema.
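For illustration only (nothing here is specified yet, and all property names below are hypothetical), such a schema file might look something like this:

```jsonc
{
    "actorRequestQueueSchemaVersion": 1,
    "title": "Product crawl queue",

    // Hypothetical: only URLs matching one of these regexes may be enqueued
    "allowedUrlRegexes": ["^https://([a-z0-9-]+\\.)?example\\.com/"],

    // Hypothetical: enforce the shape of "userData" the same way dataset schemas
    // enforce the shape of items, e.g. require that each request has a "label"
    "userDataSchema": {
        "type": "object",
        "properties": {
            "label": { "type": "string" }
        },
        "required": ["label"]
    }
}
```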
10 | 11 | **This is to be done** 12 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/ACTOR.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "amazon", 3 | "template": "puppeteer_crawler", 4 | "version": "0.1", 5 | "buildTag": "latest", 6 | "env": null 7 | } 8 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper": { 3 | "title": "Use Browser", 4 | "type": "boolean", 5 | "description": "Keep checked in order to use a real browser in evaluating amazon", 6 | "editor": "checkbox", 7 | "default": true 8 | }, 9 | "country": { 10 | "title": "Amazon market", 11 | "type": "string", 12 | "description": "Select your Amazon domain", 13 | "editor": "select", 14 | "default": "US", 15 | "enum": [ 16 | "US", 17 | "UK", 18 | "DE", 19 | "ES", 20 | "FR", 21 | "IT", 22 | "IN", 23 | "CA", 24 | "JP", 25 | "AE", 26 | "SA", 27 | "BR", 28 | "MX", 29 | "SG", 30 | "TR", 31 | "NL", 32 | "AU", 33 | "SE" 34 | ], 35 | "enumTitles": [ 36 | "amazon.com", 37 | "amazon.co.uk", 38 | "amazon.de", 39 | "amazon.es", 40 | "amazon.fr", 41 | "amazon.it", 42 | "amazon.in", 43 | "amazon.ca", 44 | "amazon.co.jp", 45 | "amazon.ae", 46 | "amazon.sa", 47 | "amazon.com.br", 48 | "amazon.com.mx", 49 | "amazon.sg", 50 | "amazon.com.tr", 51 | "amazon.nl", 52 | "amazon.com.au", 53 | "amazon.se" 54 | ] 55 | }, 56 | "category": { 57 | "title": "Store Department", 58 | "type": "string", 59 | "description": "Select the store category", 60 | "editor": "select", 61 | "default": "aps", 62 | "enumTitles": [ 63 | "All Departments", 64 | "Arts & Crafts", 65 | "Automotive", 66 | "Baby", 67 | "Beauty & Personal Care", 68 | "Books", 69 | "Computers", 70 | "Digital Music", 71 | "Electronics", 72 | "Kindle Store", 73 | "Prime Video", 74 | "Women's Fashion", 75 | "Men's Fashion", 76 | "Girls' Fashion", 77 | "Boys' Fashion", 78 | "Deals", 79 | "Health & Household", 80 | "Home & Kitchen", 81 | "Industrial & Scientific", 82 | "Luggage", 83 | "Movies & TV", 84 | "Music, CDs & Vinyl", 85 | "Pet Supplies", 86 | "Software", 87 | "Sports & Outdoors", 88 | "Tools & Home Improvement", 89 | "Toys & Games", 90 | "Video Games" 91 | ], 92 | "enum": [ 93 | "aps", 94 | "arts-crafts-intl-ship", 95 | "automotive-intl-ship", 96 | "baby-products-intl-ship", 97 | "beauty-intl-ship", 98 | "stripbooks-intl-ship", 99 | "computers-intl-ship", 100 | "digital-music", 101 | "electronics-intl-ship", 102 | "digital-text", 103 | "instant-video", 104 | "fashion-womens-intl-ship", 105 | "fashion-mens-intl-ship", 106 | "fashion-girls-intl-ship", 107 | "fashion-boys-intl-ship", 108 | "deals-intl-ship", 109 | "hpc-intl-ship", 110 | "kitchen-intl-ship", 111 | "industrial-intl-ship", 112 | "luggage-intl-ship", 113 | "movies-tv-intl-ship", 114 | "music-intl-ship", 115 | "pets-intl-ship", 116 | "software-intl-ship", 117 | "sporting-intl-ship", 118 | "tools-intl-ship", 119 | "toys-and-games-intl-ship", 120 | "videogames-intl-ship" 121 | ] 122 | }, 123 | "searchType": { 124 | "title": "Select the type of search you would like", 125 | "type": "string", 126 | "description": "Select the type of search to perform from a choice of keywords, asins or direct Urls", 127 | "editor": "select", 128 | "default": "keywords", 129 | "enum": [ 130 | "keywords", 131 | "asins", 132 
| "directUrls" 133 | ], 134 | "enumTitles": [ 135 | "Keywords", 136 | "ASINs", 137 | "Direct URLs" 138 | ] 139 | }, 140 | "search": { 141 | "title": "Search", 142 | "type": "string", 143 | "description": "Keywords, asins or directUrls you would like to extract from Amazon, comma separated", 144 | "prefill": "Iphone X,Samsung monitor 27 QHD", 145 | "editor": "textarea" 146 | }, 147 | "maxResults": { 148 | "title": "Number of results", 149 | "type": "integer", 150 | "description": "Number of results you would like to save in total.", 151 | "editor": "number" 152 | }, 153 | "proxy": { 154 | "title": "Proxy configuration", 155 | "type": "object", 156 | "description": "Select proxies to be used by your crawler.", 157 | "prefill": { 158 | "useApifyProxy": true 159 | }, 160 | "editor": "proxy" 161 | }, 162 | "maxReviews": { 163 | "title": "Number of reviews", 164 | "type": "integer", 165 | "description": "Number of reviews you would like to save per product.", 166 | "default": 0, 167 | "editor": "number" 168 | }, 169 | "delivery": { 170 | "title": "Delivery Location", 171 | "type": "string", 172 | "description": "Select the location you would like the product to be delivered to", 173 | "editor": "select", 174 | "default": "", 175 | "enum": [ 176 | "", 177 | "AU,GLUXCountryList_0", 178 | "CA,GLUXCountryList_1", 179 | "CN,GLUXCountryList_2", 180 | "JP,GLUXCountryList_3", 181 | "MX,GLUXCountryList_4", 182 | "SG,GLUXCountryList_5", 183 | "GB,GLUXCountryList_6", 184 | "AF,GLUXCountryList_7", 185 | "AX,GLUXCountryList_8", 186 | "AL,GLUXCountryList_9", 187 | "DZ,GLUXCountryList_10", 188 | "AS,GLUXCountryList_11", 189 | "AD,GLUXCountryList_12", 190 | "AO,GLUXCountryList_13", 191 | "AI,GLUXCountryList_14", 192 | "AG,GLUXCountryList_15", 193 | "AR,GLUXCountryList_16", 194 | "AM,GLUXCountryList_17", 195 | "AW,GLUXCountryList_18", 196 | "AU,GLUXCountryList_19", 197 | "AT,GLUXCountryList_20", 198 | "AZ,GLUXCountryList_21", 199 | "BS,GLUXCountryList_22", 200 | "BH,GLUXCountryList_23", 201 | "BD,GLUXCountryList_24", 202 | "BB,GLUXCountryList_25", 203 | "BY,GLUXCountryList_26", 204 | "BE,GLUXCountryList_27", 205 | "BZ,GLUXCountryList_28", 206 | "BJ,GLUXCountryList_29", 207 | "BM,GLUXCountryList_30", 208 | "BT,GLUXCountryList_31", 209 | "BO,GLUXCountryList_32", 210 | "BQ,GLUXCountryList_33", 211 | "BA,GLUXCountryList_34", 212 | "BW,GLUXCountryList_35", 213 | "BV,GLUXCountryList_36", 214 | "BR,GLUXCountryList_37", 215 | "IO,GLUXCountryList_38", 216 | "BN,GLUXCountryList_39", 217 | "BG,GLUXCountryList_40", 218 | "BF,GLUXCountryList_41", 219 | "BI,GLUXCountryList_42", 220 | "KH,GLUXCountryList_43", 221 | "CM,GLUXCountryList_44", 222 | "CA,GLUXCountryList_45", 223 | "CV,GLUXCountryList_46", 224 | "KY,GLUXCountryList_47", 225 | "CF,GLUXCountryList_48", 226 | "TD,GLUXCountryList_49", 227 | "CL,GLUXCountryList_50", 228 | "CN,GLUXCountryList_51", 229 | "CX,GLUXCountryList_52", 230 | "CC,GLUXCountryList_53", 231 | "CO,GLUXCountryList_54", 232 | "KM,GLUXCountryList_55", 233 | "CG,GLUXCountryList_56", 234 | "CD,GLUXCountryList_57", 235 | "CK,GLUXCountryList_58", 236 | "CR,GLUXCountryList_59", 237 | "CI,GLUXCountryList_60", 238 | "HR,GLUXCountryList_61", 239 | "CW,GLUXCountryList_62", 240 | "CY,GLUXCountryList_63", 241 | "CZ,GLUXCountryList_64", 242 | "DK,GLUXCountryList_65", 243 | "DJ,GLUXCountryList_66", 244 | "DM,GLUXCountryList_67", 245 | "DO,GLUXCountryList_68", 246 | "EC,GLUXCountryList_69", 247 | "EG,GLUXCountryList_70", 248 | "SV,GLUXCountryList_71", 249 | "GQ,GLUXCountryList_72", 250 | "ER,GLUXCountryList_73", 251 | 
"EE,GLUXCountryList_74", 252 | "ET,GLUXCountryList_75", 253 | "FK,GLUXCountryList_76", 254 | "FO,GLUXCountryList_77", 255 | "FJ,GLUXCountryList_78", 256 | "FI,GLUXCountryList_79", 257 | "FR,GLUXCountryList_80", 258 | "GF,GLUXCountryList_81", 259 | "PF,GLUXCountryList_82", 260 | "TF,GLUXCountryList_83", 261 | "GA,GLUXCountryList_84", 262 | "GM,GLUXCountryList_85", 263 | "GE,GLUXCountryList_86", 264 | "DE,GLUXCountryList_87", 265 | "GH,GLUXCountryList_88", 266 | "GI,GLUXCountryList_89", 267 | "GR,GLUXCountryList_90", 268 | "GL,GLUXCountryList_91", 269 | "GD,GLUXCountryList_92", 270 | "GP,GLUXCountryList_93", 271 | "GT,GLUXCountryList_94", 272 | "GG,GLUXCountryList_95", 273 | "GN,GLUXCountryList_96", 274 | "GW,GLUXCountryList_97", 275 | "GY,GLUXCountryList_98", 276 | "HT,GLUXCountryList_99", 277 | "HM,GLUXCountryList_100", 278 | "VA,GLUXCountryList_101", 279 | "HN,GLUXCountryList_102", 280 | "HK,GLUXCountryList_103", 281 | "HU,GLUXCountryList_104", 282 | "IS,GLUXCountryList_105", 283 | "IN,GLUXCountryList_106", 284 | "ID,GLUXCountryList_107", 285 | "IQ,GLUXCountryList_108", 286 | "IE,GLUXCountryList_109", 287 | "IM,GLUXCountryList_110", 288 | "IL,GLUXCountryList_111", 289 | "IT,GLUXCountryList_112", 290 | "JM,GLUXCountryList_113", 291 | "JP,GLUXCountryList_114", 292 | "JE,GLUXCountryList_115", 293 | "JO,GLUXCountryList_116", 294 | "KZ,GLUXCountryList_117", 295 | "KE,GLUXCountryList_118", 296 | "KI,GLUXCountryList_119", 297 | "KR,GLUXCountryList_120", 298 | "XK,GLUXCountryList_121", 299 | "KW,GLUXCountryList_122", 300 | "KG,GLUXCountryList_123", 301 | "LA,GLUXCountryList_124", 302 | "LV,GLUXCountryList_125", 303 | "LB,GLUXCountryList_126", 304 | "LS,GLUXCountryList_127", 305 | "LR,GLUXCountryList_128", 306 | "LY,GLUXCountryList_129", 307 | "LI,GLUXCountryList_130", 308 | "LT,GLUXCountryList_131", 309 | "LU,GLUXCountryList_132", 310 | "MO,GLUXCountryList_133", 311 | "MK,GLUXCountryList_134", 312 | "MG,GLUXCountryList_135", 313 | "MW,GLUXCountryList_136", 314 | "MY,GLUXCountryList_137", 315 | "MV,GLUXCountryList_138", 316 | "ML,GLUXCountryList_139", 317 | "MT,GLUXCountryList_140", 318 | "MH,GLUXCountryList_141", 319 | "MQ,GLUXCountryList_142", 320 | "MR,GLUXCountryList_143", 321 | "MU,GLUXCountryList_144", 322 | "YT,GLUXCountryList_145", 323 | "MX,GLUXCountryList_146", 324 | "FM,GLUXCountryList_147", 325 | "MD,GLUXCountryList_148", 326 | "MC,GLUXCountryList_149", 327 | "MN,GLUXCountryList_150", 328 | "ME,GLUXCountryList_151", 329 | "MS,GLUXCountryList_152", 330 | "MA,GLUXCountryList_153", 331 | "MZ,GLUXCountryList_154", 332 | "MM,GLUXCountryList_155", 333 | "NA,GLUXCountryList_156", 334 | "NR,GLUXCountryList_157", 335 | "NP,GLUXCountryList_158", 336 | "NL,GLUXCountryList_159", 337 | "AN,GLUXCountryList_160", 338 | "NC,GLUXCountryList_161", 339 | "NZ,GLUXCountryList_162", 340 | "NI,GLUXCountryList_163", 341 | "NE,GLUXCountryList_164", 342 | "NG,GLUXCountryList_165", 343 | "NU,GLUXCountryList_166", 344 | "NF,GLUXCountryList_167", 345 | "NO,GLUXCountryList_168", 346 | "OM,GLUXCountryList_169", 347 | "PK,GLUXCountryList_170", 348 | "PW,GLUXCountryList_171", 349 | "PS,GLUXCountryList_172", 350 | "PA,GLUXCountryList_173", 351 | "PG,GLUXCountryList_174", 352 | "PY,GLUXCountryList_175", 353 | "PE,GLUXCountryList_176", 354 | "PH,GLUXCountryList_177", 355 | "PN,GLUXCountryList_178", 356 | "PL,GLUXCountryList_179", 357 | "PT,GLUXCountryList_180", 358 | "QA,GLUXCountryList_181", 359 | "RE,GLUXCountryList_182", 360 | "RO,GLUXCountryList_183", 361 | "RU,GLUXCountryList_184", 362 | "RW,GLUXCountryList_185", 
363 | "BL,GLUXCountryList_186", 364 | "SH,GLUXCountryList_187", 365 | "KN,GLUXCountryList_188", 366 | "LC,GLUXCountryList_189", 367 | "MF,GLUXCountryList_190", 368 | "PM,GLUXCountryList_191", 369 | "VC,GLUXCountryList_192", 370 | "WS,GLUXCountryList_193", 371 | "SM,GLUXCountryList_194", 372 | "ST,GLUXCountryList_195", 373 | "SA,GLUXCountryList_196", 374 | "SN,GLUXCountryList_197", 375 | "RS,GLUXCountryList_198", 376 | "SC,GLUXCountryList_199", 377 | "SL,GLUXCountryList_200", 378 | "SG,GLUXCountryList_201", 379 | "SX,GLUXCountryList_202", 380 | "SK,GLUXCountryList_203", 381 | "SI,GLUXCountryList_204", 382 | "SB,GLUXCountryList_205", 383 | "SO,GLUXCountryList_206", 384 | "ZA,GLUXCountryList_207", 385 | "GS,GLUXCountryList_208", 386 | "ES,GLUXCountryList_209", 387 | "LK,GLUXCountryList_210", 388 | "SR,GLUXCountryList_211", 389 | "SJ,GLUXCountryList_212", 390 | "SZ,GLUXCountryList_213", 391 | "SE,GLUXCountryList_214", 392 | "CH,GLUXCountryList_215", 393 | "TW,GLUXCountryList_216", 394 | "TJ,GLUXCountryList_217", 395 | "TZ,GLUXCountryList_218", 396 | "TH,GLUXCountryList_219", 397 | "TL,GLUXCountryList_220", 398 | "TG,GLUXCountryList_221", 399 | "TK,GLUXCountryList_222", 400 | "TO,GLUXCountryList_223", 401 | "TT,GLUXCountryList_224", 402 | "TN,GLUXCountryList_225", 403 | "TR,GLUXCountryList_226", 404 | "TM,GLUXCountryList_227", 405 | "TC,GLUXCountryList_228", 406 | "TV,GLUXCountryList_229", 407 | "UG,GLUXCountryList_230", 408 | "UA,GLUXCountryList_231", 409 | "AE,GLUXCountryList_232", 410 | "GB,GLUXCountryList_233", 411 | "UM,GLUXCountryList_234", 412 | "UY,GLUXCountryList_235", 413 | "UZ,GLUXCountryList_236", 414 | "VU,GLUXCountryList_237", 415 | "VE,GLUXCountryList_238", 416 | "VN,GLUXCountryList_239", 417 | "VG,GLUXCountryList_240", 418 | "WF,GLUXCountryList_241", 419 | "EH,GLUXCountryList_242", 420 | "YE,GLUXCountryList_243", 421 | "ZM,GLUXCountryList_244", 422 | "ZW,GLUXCountryList_245" 423 | ], 424 | "enumTitles": [ 425 | "Default", 426 | "Australia", 427 | "Canada", 428 | "China", 429 | "Japan", 430 | "Mexico", 431 | "Singapore", 432 | "United Kingdom", 433 | "Afghanistan", 434 | "Aland Islands", 435 | "Albania", 436 | "Algeria", 437 | "American Samoa", 438 | "Andorra", 439 | "Angola", 440 | "Anguilla", 441 | "Antigua and Barbuda", 442 | "Argentina", 443 | "Armenia", 444 | "Aruba", 445 | "Australia", 446 | "Austria", 447 | "Azerbaijan", 448 | "Bahamas, The", 449 | "Bahrain", 450 | "Bangladesh", 451 | "Barbados", 452 | "Belarus", 453 | "Belgium", 454 | "Belize", 455 | "Benin", 456 | "Bermuda", 457 | "Bhutan", 458 | "Bolivia", 459 | "Bonaire, Saint Eustatius and Saba", 460 | "Bosnia and Herzegovina", 461 | "Botswana", 462 | "Bouvet Island", 463 | "Brazil", 464 | "British Indian Ocean Territory", 465 | "Brunei Darussalam", 466 | "Bulgaria", 467 | "Burkina Faso", 468 | "Burundi", 469 | "Cambodia", 470 | "Cameroon", 471 | "Canada", 472 | "Cape Verde", 473 | "Cayman Islands", 474 | "Central African Republic", 475 | "Chad", 476 | "Chile", 477 | "China", 478 | "Christmas Island", 479 | "Cocos (Keeling) Islands", 480 | "Colombia", 481 | "Comoros", 482 | "Congo", 483 | "Congo, The Democratic Republic of the", 484 | "Cook Islands", 485 | "Costa Rica", 486 | "Cote D'ivoire", 487 | "Croatia", 488 | "Curaçao", 489 | "Cyprus", 490 | "Czech Republic", 491 | "Denmark", 492 | "Djibouti", 493 | "Dominica", 494 | "Dominican Republic", 495 | "Ecuador", 496 | "Egypt", 497 | "El Salvador", 498 | "Equatorial Guinea", 499 | "Eritrea", 500 | "Estonia", 501 | "Ethiopia", 502 | "Falkland Islands (Malvinas)", 503 | 
"Faroe Islands", 504 | "Fiji", 505 | "Finland", 506 | "France", 507 | "French Guiana", 508 | "French Polynesia", 509 | "French Southern Territories", 510 | "Gabon", 511 | "Gambia, The", 512 | "Georgia", 513 | "Germany", 514 | "Ghana", 515 | "Gibraltar", 516 | "Greece", 517 | "Greenland", 518 | "Grenada", 519 | "Guadeloupe", 520 | "Guatemala", 521 | "Guernsey", 522 | "Guinea", 523 | "Guinea-Bissau", 524 | "Guyana", 525 | "Haiti", 526 | "Heard Island and the McDonald Islands", 527 | "Holy See", 528 | "Honduras", 529 | "Hong Kong", 530 | "Hungary", 531 | "Iceland", 532 | "India", 533 | "Indonesia", 534 | "Iraq", 535 | "Ireland", 536 | "Isle of Man", 537 | "Israel", 538 | "Italy", 539 | "Jamaica", 540 | "Japan", 541 | "Jersey", 542 | "Jordan", 543 | "Kazakhstan", 544 | "Kenya", 545 | "Kiribati", 546 | "Korea, Republic of", 547 | "Kosovo", 548 | "Kuwait", 549 | "Kyrgyzstan", 550 | "Lao People's Democratic Republic", 551 | "Latvia", 552 | "Lebanon", 553 | "Lesotho", 554 | "Liberia", 555 | "Libya", 556 | "Liechtenstein", 557 | "Lithuania", 558 | "Luxembourg", 559 | "Macao", 560 | "Macedonia, The Former Yugoslav Republic of", 561 | "Madagascar", 562 | "Malawi", 563 | "Malaysia", 564 | "Maldives", 565 | "Mali", 566 | "Malta", 567 | "Marshall Islands", 568 | "Martinique", 569 | "Mauritania", 570 | "Mauritius", 571 | "Mayotte", 572 | "Mexico", 573 | "Micronesia, Federated States of", 574 | "Moldova, Republic of", 575 | "Monaco", 576 | "Mongolia", 577 | "Montenegro", 578 | "Montserrat", 579 | "Morocco", 580 | "Mozambique", 581 | "Myanmar", 582 | "Namibia", 583 | "Nauru", 584 | "Nepal", 585 | "Netherlands", 586 | "Netherlands Antilles", 587 | "New Caledonia", 588 | "New Zealand", 589 | "Nicaragua", 590 | "Niger", 591 | "Nigeria", 592 | "Niue", 593 | "Norfolk Island", 594 | "Norway", 595 | "Oman", 596 | "Pakistan", 597 | "Palau", 598 | "Palestinian Territories", 599 | "Panama", 600 | "Papua New Guinea", 601 | "Paraguay", 602 | "Peru", 603 | "Philippines", 604 | "Pitcairn", 605 | "Poland", 606 | "Portugal", 607 | "Qatar", 608 | "Reunion", 609 | "Romania", 610 | "Russian Federation", 611 | "Rwanda", 612 | "Saint Barthelemy", 613 | "Saint Helena, Ascension and Tristan da Cunha", 614 | "Saint Kitts and Nevis", 615 | "Saint Lucia", 616 | "Saint Martin", 617 | "Saint Pierre and Miquelon", 618 | "Saint Vincent and the Grenadines", 619 | "Samoa", 620 | "San Marino", 621 | "Sao Tome and Principe", 622 | "Saudi Arabia", 623 | "Senegal", 624 | "Serbia", 625 | "Seychelles", 626 | "Sierra Leone", 627 | "Singapore", 628 | "Sint Maarten", 629 | "Slovakia", 630 | "Slovenia", 631 | "Solomon Islands", 632 | "Somalia", 633 | "South Africa", 634 | "South Georgia and the South Sandwich Islands", 635 | "Spain", 636 | "Sri Lanka", 637 | "Suriname", 638 | "Svalbard and Jan Mayen", 639 | "Swaziland", 640 | "Sweden", 641 | "Switzerland", 642 | "Taiwan", 643 | "Tajikistan", 644 | "Tanzania, United Republic of", 645 | "Thailand", 646 | "Timor-leste", 647 | "Togo", 648 | "Tokelau", 649 | "Tonga", 650 | "Trinidad and Tobago", 651 | "Tunisia", 652 | "Turkey", 653 | "Turkmenistan", 654 | "Turks and Caicos Islands", 655 | "Tuvalu", 656 | "Uganda", 657 | "Ukraine", 658 | "United Arab Emirates", 659 | "United Kingdom", 660 | "United States Minor Outlying Islands", 661 | "Uruguay", 662 | "Uzbekistan", 663 | "Vanuatu", 664 | "Venezuela", 665 | "Vietnam", 666 | "Virgin Islands, British", 667 | "Wallis and Futuna", 668 | "Western Sahara", 669 | "Yemen", 670 | "Zambia", 671 | "Zimbabwe" 672 | ] 673 | } 674 | } 675 | 676 | 
-------------------------------------------------------------------------------- /pages/tmp_schema_experiments/amazon_scraper/.actor/OUTPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "products": { 3 | // ... 4 | }, 5 | 6 | "pageScreenshots": { 7 | 8 | }, 9 | } 10 | 11 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/dataset-viewer/.actor/OUTPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "staticChart": { 3 | "title": "Static Chart", 4 | "description": "The static HTML .", 5 | "type/source": "key-value-store", 6 | "file": "view.html" // ??? 7 | }, 8 | 9 | // log, requestQueue, OUTPUT.json, ..., adhoc-webhooks? 10 | 11 | "dynamicChart": { 12 | "title": "Dynamic chart", 13 | "description": "Web browser showing rich interactive view.", 14 | // This type says that the result is available in live-view, and can be rendered as HTML 15 | "type/source": "live-view", 16 | "defaultView": true, 17 | // Perhaps better way is: 18 | "source": "live-view", 19 | "viewer": "iframe" / "modal-iframe" ??? 20 | }, 21 | } 22 | 23 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/google_search_scraper/.ACTOR/ACTOR.json: -------------------------------------------------------------------------------- 1 | { 2 | "formatVersion": 2, 3 | // No username here, Actor can be deployed to any account 4 | "name": "google-search-scraper", 5 | // We're pushing towards having human readable names shown for Actors everywhere, 6 | // so we should probably let users define it here, even if they run this code outside of Apify. 7 | // But shall the text from here overwrite changes done manually by copywriter? Probably not, 8 | // so what's the purpose of having these here? 9 | "title": "Google Search Scraper", 10 | "description": "The 200-char description", 11 | "version": "0.0", 12 | "buildTag": "latest", 13 | "env": { 14 | "MYSQL_USER": "my_username", 15 | "MYSQL_PASSWORD": "@mySecretPassword" 16 | }, 17 | "template": "basic" 18 | } 19 | -------------------------------------------------------------------------------- /pages/tmp_schema_experiments/google_search_scraper/.ACTOR/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": { 3 | "title": "Search queries or URLs", 4 | "type": "string", 5 | "description": "Google Search queries (e.g. food in NYC) and/or full URLs (e.g. https://www.google.com/search?q=food+NYC).

Enter one item per line.", 6 | "prefill": "Hotels in NYC\nRestaurants in NYC\nhttps://www.google.com/search?q=restaurants+in+NYC", 7 | "editor": "textarea", 8 | "pattern": "[^\\s]+" 9 | }, 10 | "countryCode": { 11 | "title": "Country", 12 | "type": "string", 13 | "description": "Country determines the IP address of the proxy used for the query and the Google Search domain (e.g. google.es for Spain). The values must be lower-cased ISO 3166 country codes supported by Google. By default, the Actor uses United States (google.com).

This setting only applies to Search queries, but not to URLs.", 14 | "default": "", 15 | "editor": "select", 16 | "enum": [ 17 | "", 18 | "af", 19 | "al", 20 | "dz", 21 | "as", 22 | "ad", 23 | "ao", 24 | "ai", 25 | "aq", 26 | "ag", 27 | "ar", 28 | "am", 29 | "aw", 30 | "au", 31 | "at", 32 | "az", 33 | "bs", 34 | "bh", 35 | "bd", 36 | "bb", 37 | "by", 38 | "be", 39 | "bz", 40 | "bj", 41 | "bm", 42 | "bt", 43 | "bo", 44 | "ba", 45 | "bw", 46 | "bv", 47 | "br", 48 | "io", 49 | "bn", 50 | "bg", 51 | "bf", 52 | "bi", 53 | "kh", 54 | "cm", 55 | "ca", 56 | "cv", 57 | "ky", 58 | "cf", 59 | "td", 60 | "cl", 61 | "cn", 62 | "cx", 63 | "cc", 64 | "co", 65 | "km", 66 | "cg", 67 | "cd", 68 | "ck", 69 | "cr", 70 | "ci", 71 | "hr", 72 | "cu", 73 | "cy", 74 | "cz", 75 | "dk", 76 | "dj", 77 | "dm", 78 | "do", 79 | "ec", 80 | "eg", 81 | "sv", 82 | "gq", 83 | "er", 84 | "ee", 85 | "et", 86 | "fk", 87 | "fo", 88 | "fj", 89 | "fi", 90 | "fr", 91 | "gf", 92 | "pf", 93 | "tf", 94 | "ga", 95 | "gm", 96 | "ge", 97 | "de", 98 | "gh", 99 | "gi", 100 | "gr", 101 | "gl", 102 | "gd", 103 | "gp", 104 | "gu", 105 | "gt", 106 | "gn", 107 | "gw", 108 | "gy", 109 | "ht", 110 | "hm", 111 | "va", 112 | "hn", 113 | "hk", 114 | "hu", 115 | "is", 116 | "in", 117 | "id", 118 | "ir", 119 | "iq", 120 | "ie", 121 | "il", 122 | "it", 123 | "jm", 124 | "jp", 125 | "jo", 126 | "kz", 127 | "ke", 128 | "ki", 129 | "kp", 130 | "kr", 131 | "kw", 132 | "kg", 133 | "la", 134 | "lv", 135 | "lb", 136 | "ls", 137 | "lr", 138 | "ly", 139 | "li", 140 | "lt", 141 | "lu", 142 | "mo", 143 | "mk", 144 | "mg", 145 | "mw", 146 | "my", 147 | "mv", 148 | "ml", 149 | "mt", 150 | "mh", 151 | "mq", 152 | "mr", 153 | "mu", 154 | "yt", 155 | "mx", 156 | "fm", 157 | "md", 158 | "mc", 159 | "mn", 160 | "ms", 161 | "ma", 162 | "mz", 163 | "mm", 164 | "na", 165 | "nr", 166 | "np", 167 | "nl", 168 | "an", 169 | "nc", 170 | "nz", 171 | "ni", 172 | "ne", 173 | "ng", 174 | "nu", 175 | "nf", 176 | "mp", 177 | "no", 178 | "om", 179 | "pk", 180 | "pw", 181 | "ps", 182 | "pa", 183 | "pg", 184 | "py", 185 | "pe", 186 | "ph", 187 | "pn", 188 | "pl", 189 | "pt", 190 | "pr", 191 | "qa", 192 | "re", 193 | "ro", 194 | "ru", 195 | "rw", 196 | "sh", 197 | "kn", 198 | "lc", 199 | "pm", 200 | "vc", 201 | "ws", 202 | "sm", 203 | "st", 204 | "sa", 205 | "sn", 206 | "cs", 207 | "sc", 208 | "sl", 209 | "sg", 210 | "sk", 211 | "si", 212 | "sb", 213 | "so", 214 | "za", 215 | "gs", 216 | "es", 217 | "lk", 218 | "sd", 219 | "sr", 220 | "sj", 221 | "sz", 222 | "se", 223 | "ch", 224 | "sy", 225 | "tw", 226 | "tj", 227 | "tz", 228 | "th", 229 | "tl", 230 | "tg", 231 | "tk", 232 | "to", 233 | "tt", 234 | "tn", 235 | "tr", 236 | "tm", 237 | "tc", 238 | "tv", 239 | "ug", 240 | "ua", 241 | "ae", 242 | "gb", 243 | "us", 244 | "um", 245 | "uy", 246 | "uz", 247 | "vu", 248 | "ve", 249 | "vn", 250 | "vg", 251 | "vi", 252 | "wf", 253 | "eh", 254 | "ye", 255 | "zm", 256 | "zw" 257 | ], 258 | "enumTitles": [ 259 | "United States", 260 | "Afghanistan", 261 | "Albania", 262 | "Algeria", 263 | "American Samoa", 264 | "Andorra", 265 | "Angola", 266 | "Anguilla", 267 | "Antarctica", 268 | "Antigua and Barbuda", 269 | "Argentina", 270 | "Armenia", 271 | "Aruba", 272 | "Australia", 273 | "Austria", 274 | "Azerbaijan", 275 | "Bahamas", 276 | "Bahrain", 277 | "Bangladesh", 278 | "Barbados", 279 | "Belarus", 280 | "Belgium", 281 | "Belize", 282 | "Benin", 283 | "Bermuda", 284 | "Bhutan", 285 | "Bolivia", 286 | "Bosnia and Herzegovina", 287 | "Botswana", 288 | "Bouvet Island", 289 | "Brazil", 290 | "British 
Indian Ocean Territory",
291 | "Brunei Darussalam",
292 | "Bulgaria",
293 | "Burkina Faso",
294 | "Burundi",
295 | "Cambodia",
296 | "Cameroon",
297 | "Canada",
298 | "Cape Verde",
299 | "Cayman Islands",
300 | "Central African Republic",
301 | "Chad",
302 | "Chile",
303 | "China",
304 | "Christmas Island",
305 | "Cocos (Keeling) Islands",
306 | "Colombia",
307 | "Comoros",
308 | "Congo",
309 | "Congo, the Democratic Republic of the",
310 | "Cook Islands",
311 | "Costa Rica",
312 | "Cote D'ivoire",
313 | "Croatia",
314 | "Cuba",
315 | "Cyprus",
316 | "Czech Republic",
317 | "Denmark",
318 | "Djibouti",
319 | "Dominica",
320 | "Dominican Republic",
321 | "Ecuador",
322 | "Egypt",
323 | "El Salvador",
324 | "Equatorial Guinea",
325 | "Eritrea",
326 | "Estonia",
327 | "Ethiopia",
328 | "Falkland Islands (Malvinas)",
329 | "Faroe Islands",
330 | "Fiji",
331 | "Finland",
332 | "France",
333 | "French Guiana",
334 | "French Polynesia",
335 | "French Southern Territories",
336 | "Gabon",
337 | "Gambia",
338 | "Georgia",
339 | "Germany",
340 | "Ghana",
341 | "Gibraltar",
342 | "Greece",
343 | "Greenland",
344 | "Grenada",
345 | "Guadeloupe",
346 | "Guam",
347 | "Guatemala",
348 | "Guinea",
349 | "Guinea-Bissau",
350 | "Guyana",
351 | "Haiti",
352 | "Heard Island and Mcdonald Islands",
353 | "Holy See (Vatican City State)",
354 | "Honduras",
355 | "Hong Kong",
356 | "Hungary",
357 | "Iceland",
358 | "India",
359 | "Indonesia",
360 | "Iran, Islamic Republic of",
361 | "Iraq",
362 | "Ireland",
363 | "Israel",
364 | "Italy",
365 | "Jamaica",
366 | "Japan",
367 | "Jordan",
368 | "Kazakhstan",
369 | "Kenya",
370 | "Kiribati",
371 | "Korea, Democratic People's Republic of",
372 | "Korea, Republic of",
373 | "Kuwait",
374 | "Kyrgyzstan",
375 | "Lao People's Democratic Republic",
376 | "Latvia",
377 | "Lebanon",
378 | "Lesotho",
379 | "Liberia",
380 | "Libyan Arab Jamahiriya",
381 | "Liechtenstein",
382 | "Lithuania",
383 | "Luxembourg",
384 | "Macao",
385 | "Macedonia, the Former Yugoslav Republic of",
386 | "Madagascar",
387 | "Malawi",
388 | "Malaysia",
389 | "Maldives",
390 | "Mali",
391 | "Malta",
392 | "Marshall Islands",
393 | "Martinique",
394 | "Mauritania",
395 | "Mauritius",
396 | "Mayotte",
397 | "Mexico",
398 | "Micronesia, Federated States of",
399 | "Moldova, Republic of",
400 | "Monaco",
401 | "Mongolia",
402 | "Montserrat",
403 | "Morocco",
404 | "Mozambique",
405 | "Myanmar",
406 | "Namibia",
407 | "Nauru",
408 | "Nepal",
409 | "Netherlands",
410 | "Netherlands Antilles",
411 | "New Caledonia",
412 | "New Zealand",
413 | "Nicaragua",
414 | "Niger",
415 | "Nigeria",
416 | "Niue",
417 | "Norfolk Island",
418 | "Northern Mariana Islands",
419 | "Norway",
420 | "Oman",
421 | "Pakistan",
422 | "Palau",
423 | "Palestinian Territory, Occupied",
424 | "Panama",
425 | "Papua New Guinea",
426 | "Paraguay",
427 | "Peru",
428 | "Philippines",
429 | "Pitcairn",
430 | "Poland",
431 | "Portugal",
432 | "Puerto Rico",
433 | "Qatar",
434 | "Reunion",
435 | "Romania",
436 | "Russian Federation",
437 | "Rwanda",
438 | "Saint Helena",
439 | "Saint Kitts and Nevis",
440 | "Saint Lucia",
441 | "Saint Pierre and Miquelon",
442 | "Saint Vincent and the Grenadines",
443 | "Samoa",
444 | "San Marino",
445 | "Sao Tome and Principe",
446 | "Saudi Arabia",
447 | "Senegal",
448 | "Serbia and Montenegro",
449 | "Seychelles",
450 | "Sierra Leone",
451 | "Singapore",
452 | "Slovakia",
453 | "Slovenia",
454 | "Solomon Islands",
455 | "Somalia",
456 | "South Africa",
457 | "South Georgia and the South Sandwich Islands",
458 | "Spain", 459 | "Sri Lanka", 460 | "Sudan", 461 | "Suriname", 462 | "Svalbard and Jan Mayen", 463 | "Swaziland", 464 | "Sweden", 465 | "Switzerland", 466 | "Syrian Arab Republic", 467 | "Taiwan, Province of China", 468 | "Tajikistan", 469 | "Tanzania, United Republic of", 470 | "Thailand", 471 | "Timor-Leste", 472 | "Togo", 473 | "Tokelau", 474 | "Tonga", 475 | "Trinidad and Tobago", 476 | "Tunisia", 477 | "Turkey", 478 | "Turkmenistan", 479 | "Turks and Caicos Islands", 480 | "Tuvalu", 481 | "Uganda", 482 | "Ukraine", 483 | "United Arab Emirates", 484 | "United Kingdom", 485 | "United States", 486 | "United States Minor Outlying Islands", 487 | "Uruguay", 488 | "Uzbekistan", 489 | "Vanuatu", 490 | "Venezuela", 491 | "Viet Nam", 492 | "Virgin Islands, British", 493 | "Virgin Islands, U.S.", 494 | "Wallis and Futuna", 495 | "Western Sahara", 496 | "Yemen", 497 | "Zambia", 498 | "Zimbabwe" 499 | ] 500 | }, 501 | "languageCode": { 502 | "title": "Language", 503 | "type": "string", 504 | "description": "Language for the search results, which is passed to Google Search as the hl URL query parameter. Only set this if you want to use a non-default language for the selected country. The values must be lower-cased ISO 639 language codes supported by Google.

This setting only applies to Search queries, but not to URLs.",
505 | "default": "",
506 | "editor": "select",
507 | "enum": [
508 | "",
509 | "af",
510 | "sq",
511 | "sm",
512 | "ar",
513 | "az",
514 | "eu",
515 | "be",
516 | "bn",
517 | "bh",
518 | "bs",
519 | "bg",
520 | "ca",
521 | "zh-CN",
522 | "zh-TW",
523 | "hr",
524 | "cs",
525 | "da",
526 | "nl",
527 | "en",
528 | "eo",
529 | "et",
530 | "fo",
531 | "fi",
532 | "fr",
533 | "fy",
534 | "gl",
535 | "ka",
536 | "de",
537 | "el",
538 | "gu",
539 | "iw",
540 | "hi",
541 | "hu",
542 | "is",
543 | "id",
544 | "ia",
545 | "ga",
546 | "it",
547 | "ja",
548 | "jw",
549 | "kn",
550 | "ko",
551 | "la",
552 | "lv",
553 | "lt",
554 | "mk",
555 | "ms",
556 | "ml",
557 | "mt",
558 | "mr",
559 | "ne",
560 | "no",
561 | "nn",
562 | "oc",
563 | "fa",
564 | "pl",
565 | "pt-BR",
566 | "pt-PT",
567 | "pa",
568 | "ro",
569 | "ru",
570 | "gd",
571 | "sr",
572 | "si",
573 | "sk",
574 | "sl",
575 | "es",
576 | "su",
577 | "sw",
578 | "sv",
579 | "tl",
580 | "ta",
581 | "te",
582 | "th",
583 | "ti",
584 | "tr",
585 | "uk",
586 | "ur",
587 | "uz",
588 | "vi",
589 | "cy",
590 | "xh",
591 | "zu"
592 | ],
593 | "enumTitles": [
594 | "Default",
595 | "Afrikaans",
596 | "Albanian",
597 | "Amharic",
598 | "Arabic",
599 | "Azerbaijani",
600 | "Basque",
601 | "Belarusian",
602 | "Bengali",
603 | "Bihari",
604 | "Bosnian",
605 | "Bulgarian",
606 | "Catalan",
607 | "Chinese (Simplified)",
608 | "Chinese (Traditional)",
609 | "Croatian",
610 | "Czech",
611 | "Danish",
612 | "Dutch",
613 | "English",
614 | "Esperanto",
615 | "Estonian",
616 | "Faroese",
617 | "Finnish",
618 | "French",
619 | "Frisian",
620 | "Galician",
621 | "Georgian",
622 | "German",
623 | "Greek",
624 | "Gujarati",
625 | "Hebrew",
626 | "Hindi",
627 | "Hungarian",
628 | "Icelandic",
629 | "Indonesian",
630 | "Interlingua",
631 | "Irish",
632 | "Italian",
633 | "Japanese",
634 | "Javanese",
635 | "Kannada",
636 | "Korean",
637 | "Latin",
638 | "Latvian",
639 | "Lithuanian",
640 | "Macedonian",
641 | "Malay",
642 | "Malayalam",
643 | "Maltese",
644 | "Marathi",
645 | "Nepali",
646 | "Norwegian",
647 | "Norwegian (Nynorsk)",
648 | "Occitan",
649 | "Persian",
650 | "Polish",
651 | "Portuguese (Brazil)",
652 | "Portuguese (Portugal)",
653 | "Punjabi",
654 | "Romanian",
655 | "Russian",
656 | "Scots Gaelic",
657 | "Serbian",
658 | "Sinhalese",
659 | "Slovak",
660 | "Slovenian",
661 | "Spanish",
662 | "Sundanese",
663 | "Swahili",
664 | "Swedish",
665 | "Tagalog",
666 | "Tamil",
667 | "Telugu",
668 | "Thai",
669 | "Tigrinya",
670 | "Turkish",
671 | "Ukrainian",
672 | "Urdu",
673 | "Uzbek",
674 | "Vietnamese",
675 | "Welsh",
676 | "Xhosa",
677 | "Zulu"
678 | ]
679 | },
680 | "locationUule": {
681 | "title": "UULE location code",
682 | "type": "string",
683 | "description": "The code for geolocation of search results. It's passed to Google Search as the uule URL query parameter. You can use the UULE code generator. Learn more about emulating local search.

This setting only applies to Search queries, but not to URLs.", 684 | "editor": "textfield" 685 | }, 686 | "resultsPerPage": { 687 | "title": "Results per page", 688 | "type": "integer", 689 | "description": "Number of search results per page. By default, Google Search returns 10 results. The allowed values are: 10, 20, 30, 40, 50 and 100.

This setting only applies to Search queries, but not to URLs.",
690 | "maximum": 100,
691 | "minimum": 1
692 | },
693 | "maxPagesPerQuery": {
694 | "title": "Max pages per query",
695 | "type": "integer",
696 | "description": "The maximum number of search result pages crawled for each search query or URL. Note that a value greater than one might significantly slow down the Actor.",
697 | "default": 1,
698 | "minimum": 1
699 | },
700 | "customDataFunction": {
701 | "title": "Custom data function",
702 | "type": "string",
703 | "description": "Custom JavaScript function to extract additional attributes from the HTML of the result pages. The function accepts the same parameters as the handlePageFunction of the CheerioCrawler in the Apify SDK. The return value of the function is saved to the results as the customData property.",
704 | "editor": "javascript",
705 | "prefill": "async ({ input, $, request, response, html }) => {\n return {\n pageTitle: $('title').text(),\n };\n};"
706 | },
707 | "maxConcurrency": {
708 | "title": "Max concurrency",
709 | "type": "integer",
710 | "description": "The maximum number of search results pages the crawler will load in parallel. A higher number means you will get your results faster, but it will also burn through your available proxies more quickly.",
711 | "default": 10,
712 | "maximum": 100,
713 | "minimum": 1
714 | },
715 | "saveHtml": {
716 | "title": "Save HTML to dataset",
717 | "type": "boolean",
718 | "description": "If checked, the HTML of the Google Search results pages will be stored to the default dataset, under the html property. This is useful if you need to process the HTML, but it makes the dataset large and reduces performance.",
719 | "default": false,
720 | "groupCaption": "Options"
721 | },
722 | "saveHtmlToKeyValueStore": {
723 | "title": "Save HTML to key-value store",
724 | "type": "boolean",
725 | "description": "If checked, the HTML of the Google Search results pages will be stored to the default key-value store, and links to the files will be stored to the dataset under the htmlSnapshotUrl property. This is useful for debugging, since you can easily view the pages in a browser, but use of this feature has some performance penalty.",
726 | "default": false
727 | },
728 | "mobileResults": {
729 | "title": "Mobile results",
730 | "type": "boolean",
731 | "description": "If checked, the crawler will return results for the mobile version of Google Search. By default, desktop results are returned.",
732 | "default": false
733 | },
734 | "includeUnfilteredResults": {
735 | "title": "Include unfiltered results",
736 | "type": "boolean",
737 | "description": "If checked, the lower-quality results that Google normally filters out will be included. Usually this amounts to a few hundred extra results.",
738 | "default": false
739 | }
740 | }
741 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/OUTPUT.json:
--------------------------------------------------------------------------------
1 | // Example of an OUTPUT.json file, automatically generated by the system according to OUTPUT_SCHEMA.json
2 | // This could be the result of the run-Actor sync/async API endpoint, in case there is no OUTPUT.json?
3 | 
4 | {
5 |   "searchResults": "https://api.apify.com/v2/datasets/[DEFAULT_DATASET_ID]/items?format=[FORMAT]",
6 | 
7 |   "pageHtmlSnapshots": "https://api.apify.com/v2/key-value-stores/[DEFAULT_KEY_VALUE_STORE_ID]/records?prefix=[PREFIX]",
8 | 
9 |   "pageScreenshots": "https://api.apify.com/v2/key-value-stores/[DEFAULT_KEY_VALUE_STORE_ID]/records?prefix=[PREFIX]",
10 | 
11 |   "demo": "liveviewUrl when Actor runs"
12 | }
13 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/OUTPUT_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |   // The system will generate the OUTPUT.json file and save it right away, according to the schema below,
3 |   // so consumers can read it immediately after the Actor starts. This is done before writing INPUT.json
4 |   "searchResults": {
5 |     "title": "Search results",
6 |     "description": "The main results of the Actor; each record is one Google SERPs page.",
7 |     "type": "dataset", // or "default-dataset" ?
8 |     // How to tell the system whether it should use the default dataset, or a named one? Or what if the Actor caller
9 |     // could say which dataset/kv-store should be used for the run (new, or existing with a name),
10 |     // and "dataset" here would just mean the dataset produced by the Actor.
11 |     "default": true,
12 |     "schemaFile": "./schemas/GOOGLE_SERPS_DATASET_SCHEMA.json"
13 |   },
14 | 
15 |   "pageHtmlSnapshots": {
16 |     "title": "Page HTML snapshots",
17 |     "description": "Saved snapshots of the search result pages. Only available if enabled on input. Useful for testing, to review the original source.",
18 |     "type/source/target/location": "key-value-store", // or "default-key-value-store" ?
19 |     // Defines a filter for the records to show/return; we'd have to add a "prefix" query param to the kv-store API
20 |     "prefix": "SNAPSHOT-html-",
21 |     // How to enforce???? Maybe in the future
22 |     "contentType": "text/html"
23 |   },
24 | 
25 |   "pageScreenshots": {
26 |     "title": "Page screenshots",
27 |     "description": "Saved screenshots of the search result pages. Only available if enabled on input. Useful for testing, to review the original source.",
28 |     "type": "key-value-store", // or "default-key-value-store" ?
29 |     "prefix": "SCREENSHOT-png-"
30 |   }
31 | }
32 | 
33 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/schemas/GOOGLE_SERPS_DATASET_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Google Search Results",
3 |   "description": "Data about Google Search results yadadada",
4 | 
5 |   // This is basically a dataset schema here (so maybe call it "schema"?):
6 |   "fields": {
7 |     "url": "string",
8 |     "hasNextPage": "boolean",
9 |     "resultsTotal": "number",
10 |     "relatedQueries": "array",
11 |     "searchQuery": {
12 |       "term": "string",
13 |       "page": "number",
14 |       "type": "string",
15 |       "domain": "string",
16 |       "countryCode": "string",
17 |       "languageCode": "string",
18 |       "locationUule": "string",
19 |       "resultsPerPage": "number"
20 |     },
21 |     "organicResults": "array",
22 |     "organicResults.$": {
23 |       // ... xxx
24 |     },
25 |     "paidResults": "array",
26 |     // Or use this syntax?
27 |     "searchQuery": "object",
28 |     "searchQuery.term": {
29 |       "type": "string",
30 |       "optional": true
31 |     },
32 |     "searchQuery.page": "number",
33 |     "searchQuery.type": "string",
34 |     "searchQuery.domain": "string",
35 |     "searchQuery.countryCode": "string",
36 |     "searchQuery.languageCode": "string",
37 |     "searchQuery.locationUule": "string",
38 |     "searchQuery.resultsPerPage": "number"
39 |   },
40 |   // This should tell the output consumers how to render or preview the data.
41 |   "views": {
42 |     "default": {
43 |       "name": "All SERPs grouped by page",
44 |       "fields": "searchQuery.term,organicResults",
45 |       "descending": true,
46 |       "format": "html" // Is this needed? The consumer of the output should pick the format
47 |     },
48 |     "raw": {
49 |       "name": "All search results",
50 |       "unwind": "organicResults",
51 |       "fields": "searchQuery,organicResults"
52 |     }
53 |   },
54 | 
55 |   // Similar to prefill; not sure about this. Who will fill it in?
56 |   // Maybe go back to an example run, or just a dataset URL?
57 |   "example": [{
58 |     "url": "http://google.com",
59 |     "searchQuery": "ffwef",
60 |     ...
61 |   }]
62 | }
63 | 
--------------------------------------------------------------------------------
/pages/tmp_schema_experiments/google_search_scraper/.ACTOR/schemas/GOOGLE_SERPS_SCREENSHOTS_KV_STORE_SCHEMA.json:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | // TODO: Does this make sense??? What for??? A dataset is different from a KV-store...
4 | // Maybe it would be useful in that it would:
5 | // 1) Check that new records have the correct content-type when storing them, and fail otherwise
6 | // 2) Somehow render the view differently???
7 | // IMHO both cases are quite weak, plus we'd need to allow multiple such schemas per KV-store (unlike a dataset),
8 | // so I'd skip this.
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-frontmatter>=1.0.0
2 | markdown>=3.4.0
3 | regex>=2023.0.0
--------------------------------------------------------------------------------
/scripts/md2mdx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | Markdown to MDX Transformer
5 | ---------------------------
6 | 
7 | This script transforms standard Markdown files into MDX files for use with Astro.
8 | It performs several transformations:
9 | 
10 | 1. Processes ASTRO comments into component tags:
11 |     - CodeSwitcher and CodeExample components.
12 |     - Illustration, Diagram, and Picture components.
13 |     - Removes redundant titles in code blocks.
14 | 2. Transforms image references to Astro Picture components.
15 | 3. Removes the table of contents (the Astro site has its own table of contents).
16 | 4. Adds a GitHub header.
17 | 5. Removes bold formatting from fully bold lines.
18 | 6. Transforms schema file links to proper paths.
19 | 
20 | Usage:
21 |     The script reads Markdown files from the source root (README.md and pages/*.md)
22 |     and writes MDX files to sync/target/src/content/pages/ (README.md becomes index.mdx):
23 | 
24 |     $ python3 scripts/md2mdx.py
25 | 
26 | Dependencies:
27 |     - python-frontmatter
28 |     - pathlib
29 |     - re (regex)
30 | """
31 | 
32 | import os
33 | import sys
34 | from pathlib import Path
35 | import frontmatter
36 | import re
37 | import glob
38 | import argparse
39 | 
40 | 
41 | def parse_args():
42 |     """Parse command line arguments."""
43 | 
44 |     parser = argparse.ArgumentParser(description='Transform Markdown files to MDX format.')
45 |     parser.add_argument('--source', type=str, help='Source directory containing markdown files')
46 |     parser.add_argument('--target', type=str, help='Target directory for MDX files')
47 | 
48 |     return parser.parse_args()
49 | 
50 | 
51 | # Get the project root - need to handle both direct run and test-sync run.
52 | SCRIPT_PATH = Path(__file__).resolve()
53 | args = parse_args()
54 | 
55 | print(f'Script location: {__file__}')
56 | 
57 | if args.source and args.target:
58 |     SOURCE_ROOT = Path(args.source).resolve()
59 |     TARGET_ROOT = Path(args.target).resolve()
60 | else:
61 |     PROJECT_ROOT = SCRIPT_PATH.parent.parent
62 |     SOURCE_ROOT = PROJECT_ROOT / 'sync/source'
63 |     TARGET_ROOT = PROJECT_ROOT / 'sync/target'
64 | 
65 | print(f'Source root: {SOURCE_ROOT}')
66 | print(f'Target root: {TARGET_ROOT}')
67 | 
68 | # Required imports for the MDX file.
69 | ASTRO_IMPORTS = '''import { Picture } from 'astro:assets';
70 | import CodeExample from '../../components/CodeExample.astro';
71 | import CodeSwitcher from '../../components/CodeSwitcher.astro';
72 | import Diagram from '../../components/Diagram.astro';
73 | import GitHubHeader from '../../components/GitHubHeader.astro';
74 | import illuApifyStore from './illu-apify-store@2x.png';
75 | import illuAPIGetInput from './illu-get-input@2x.png';
76 | import illuAPIKeyValueStoreAccess from './illu-api-key-value-store-access@2x.png';
77 | import illuAPIMetamorph from './illu-api-metamorph@2x.gif';
78 | import illuAPIPush from './illu-api-push@2x.gif';
79 | import illuAPIReboot from './illu-api-reboot@2x.png';
80 | import illuAPIStartAnother from './illu-api-start-another@2x.png';
81 | import illuAPIWebServer from './illu-api-webserver@2x.gif';
82 | import illuBasicConceptsInput from './illu-basic-concepts-input@2x.gif';
83 | import illuBasicConceptsIntegrations from './illu-basic-concepts-integrations@2x.png';
84 | import illuBasicConceptsOutput from './illu-basic-concepts-output@2x.gif';
85 | import illuBasicConceptsRunEnvironment from './illu-basic-concepts-docker@2x.gif';
86 | import illuBasicConceptsStorage from './illu-basic-concepts-storage@2x.png';
87 | import illuBasicConceptsStorageDataset from './illu-basic-concepts-storage-dataset@2x.png';
88 | import illuBasicConceptsStorageKeyValueStore from './illu-basic-concepts-storage-key-value-store@2x.png';
89 | import illuDatasetSchema from './illu-dataset-schema@2x.png';
90 | import illuDefinitionFilesInputSchemaFile from './illu-definition-files-input-schema-file@2x.png';
91 | import illuDefinitionFilesOutputSchemaFile from './illu-definition-files-output-schema-file@2x.png';
92 | import illuDevelopmentDeployment from './illu-development-deployment@2x.png';
93 | import illuDevelopmentLocal from './illu-development-local@2x.png';
94 | import illuDiagramHoriz from './illu-diagram-horiz@2x.png';
95 | import illuDiagramVert from './illu-diagram-vert@2x.png';
96 | import illuPhilosophyWhyTheName from 
'./illu-philosophy-why-the-name@2x.png'; 97 | import illuSharingChargingMoney from './illu-sharing-charging-money@2x.gif'; 98 | import illuSharingMonetization from './illu-sharing-monetization@2x.png'; 99 | import Illustration from '../../components/Illustration.astro'; 100 | import illuTakerInput from './illu-taker-input@2x.png';''' 101 | 102 | 103 | IGNORED_FILES = { 104 | 'license.md', # ignore case-insensitive 105 | # Add more files here as needed, e.g.: 106 | # 'contributing.md', 107 | # 'changelog.md', 108 | } 109 | 110 | 111 | def should_process_file(path: Path) -> bool: 112 | """Determine if a file should be processed based on ignore rules.""" 113 | 114 | # Case-insensitive filename check. 115 | if path.name.lower() in IGNORED_FILES: 116 | print(f'\n󰋼 Skipping ignored file: {path.name}') 117 | return False 118 | return True 119 | 120 | 121 | def remove_table_of_contents(content: str) -> str: 122 | """Remove the table of contents section from the markdown content.""" 123 | 124 | print('\n󰋼 Removing table of contents...') 125 | 126 | def replace_toc(match): 127 | print(' ⭮ Removed table of contents section') 128 | return '' 129 | 130 | return re.sub( 131 | r'## Contents\n\n[\s\S]*?', 132 | replace_toc, 133 | content 134 | ) 135 | 136 | 137 | def transform_image_references(content: str) -> str: 138 | """Transform markdown image references to Astro Picture components.""" 139 | 140 | print('\n󰋼 Transforming image references...') 141 | 142 | def replace_image(match): 143 | alt, src = match.groups() 144 | print(f' ⭮ {src}') 145 | basename = os.path.basename(src) 146 | return f'' 147 | 148 | return re.sub( 149 | r'!\[(.*?)\]\((.*?)\)', 150 | replace_image, 151 | content 152 | ) 153 | 154 | 155 | def add_github_header(content: str, is_readme: bool) -> str: 156 | """Add GitHub header component after the first heading, but only for README.md.""" 157 | 158 | if not is_readme: 159 | return content 160 | 161 | print('\n󰋼 Adding GitHub header...') 162 | print(' ⭮ Adding GitHub header') 163 | return re.sub( 164 | r'(#\s+[^\n]*\n)(\n?)', 165 | r'\1\n\n\n', 166 | content, 167 | count=1 168 | ) 169 | 170 | 171 | def remove_bold_formatting(content: str) -> str: 172 | """Remove bold formatting from lines that are entirely bold.""" 173 | 174 | print('\n󰋼 Removing bold formatting...') 175 | 176 | def replace_bold(match): 177 | text = match.group(1) 178 | print(f' ⭮ {text[:120]}') 179 | return text 180 | 181 | return re.sub( 182 | r'^\*\*(.*?)\*\*$', 183 | replace_bold, 184 | content, 185 | flags=re.MULTILINE 186 | ) 187 | 188 | 189 | def remove_picture_components(content: str) -> str: 190 | """Remove Picture components that aren't preceded by ASTRO comments.""" 191 | 192 | print('\n󰋼 Removing Picture components...') 193 | 194 | def replace_picture(match): 195 | picture = re.sub(r'\s+', ' ', match.group(0)) 196 | print(f' ⭮ {picture[:120]}') 197 | return '' 198 | 199 | return re.sub( 200 | r'(?', 201 | replace_picture, 202 | content, 203 | flags=re.MULTILINE | re.DOTALL 204 | ) 205 | 206 | 207 | def transform_astro_blocks(content: str) -> str: 208 | """Transform ASTRO comments into component tags. 209 | 210 | This function processes: 211 | 1. CodeSwitcher and CodeExample components, removing redundant titles. 212 | 2. Illustration, Diagram and Picture components. 213 | """ 214 | 215 | print('\n󰋼 Transforming ASTRO blocks...') 216 | 217 | def replace_astro_block(match): 218 | # Get the component definition but preserve internal whitespace. 
219 | component = match.group(1).strip() 220 | 221 | # Handle CodeSwitcher tags. 222 | if component == '': 223 | print(' ⭮ Adding CodeSwitcher opening tag') 224 | return '' 225 | elif component == '': 226 | print(' ⭮ Adding CodeSwitcher closing tag') 227 | return '' 228 | 229 | # Handle CodeExample tags with titles. 230 | code_example_match = re.match(r'', component) 231 | 232 | if code_example_match: 233 | title = code_example_match.group(1) 234 | print(f' ⭮ Adding CodeExample tag with title: {title}') 235 | return f'' 236 | elif component == '': 237 | print(' ⭮ Adding CodeExample closing tag') 238 | return '' 239 | 240 | # Handle media components (Illustration, Diagram, Picture). 241 | if (component.startswith('' 249 | 250 | # First transform all ASTRO comments to their respective components. 251 | content = re.sub( 252 | r'', 253 | replace_astro_block, 254 | content, 255 | flags=re.MULTILINE | re.DOTALL 256 | ) 257 | 258 | # Then remove redundant h3/h4 titles that appear right after CodeExample tags. 259 | def remove_redundant_titles(match): 260 | block = match.group(0) 261 | 262 | # Match any h3 or h4 heading after the opening tag, including across newlines. 263 | block = re.sub( 264 | r'(]+>)(\s*\n)*\s*#{3,4}[^\n]+\n', 265 | lambda m: print(' ⭮ Removing heading after CodeExample') or m.group(1) + '\n', 266 | block, 267 | count=1 # Only remove the first heading found 268 | ) 269 | 270 | return block 271 | 272 | # Process each CodeExample block to remove redundant titles. 273 | content = re.sub( 274 | r']+>[\s\S]+?', 275 | remove_redundant_titles, 276 | content 277 | ) 278 | 279 | return content 280 | 281 | 282 | def transform_schema_links(content: str) -> str: 283 | """Transform schema file links to their proper paths.""" 284 | 285 | print('\n󰋼 Transforming schema links...') 286 | 287 | def replace_link(match, suffix_lower): 288 | text, path = match.groups() 289 | new_path = f'/{path.lower().replace("_", "-")}-{suffix_lower}' 290 | print(f' ⭮ {text} → {new_path}') 291 | return f'[{text}]({new_path})' 292 | 293 | # Define patterns for both schema and file links. 294 | replacements = { 295 | r'\[([^]]+)\]\(./pages/([^)]+)_SCHEMA\.md\)': 296 | lambda m: replace_link(m, 'schema'), 297 | r'\[([^]]+)\]\(./pages/([^)]+)_FILE\.md\)': 298 | lambda m: replace_link(m, 'file') 299 | } 300 | 301 | # Apply each replacement pattern. 302 | for pattern, replacement in replacements.items(): 303 | content = re.sub(pattern, replacement, content) 304 | 305 | return content 306 | 307 | 308 | def remove_html_comments(content: str) -> str: 309 | """Remove all HTML comments from the content.""" 310 | 311 | print('\n󰋼 Removing HTML comments...') 312 | 313 | def replace_comment(match): 314 | comment = match.group(0) 315 | print(f' ⭮ Removing comment: {comment[:120]}') 316 | return '' 317 | 318 | return re.sub( 319 | r'', 320 | replace_comment, 321 | content, 322 | flags=re.MULTILINE | re.DOTALL 323 | ) 324 | 325 | 326 | def transform_internal_links(content: str) -> str: 327 | """Transform internal markdown links to the new MDX format.""" 328 | 329 | print('\n󰋼 Transforming internal links...') 330 | 331 | def format_link_text(text): 332 | """Convert technical names to readable titles.""" 333 | 334 | # Remove file extensions. 335 | text = re.sub(r'\.(json|md)$', '', text) 336 | 337 | # Handle special cases. 338 | if text == 'README': 339 | return 'Documentation' 340 | 341 | # Convert UPPER_CASE to Title Case. 
342 | if text.isupper(): 343 | words = text.split('_') 344 | return ' '.join(word.capitalize() for word in words) 345 | 346 | return text 347 | 348 | def replace_link(match): 349 | text, path, anchor = match.groups() 350 | 351 | # Handle different link types. 352 | if path: 353 | # Remove .md extension if present. 354 | path = path.replace('.md', '') 355 | 356 | if path == '../README': 357 | # Links to README become root links. 358 | new_path = '/' 359 | else: 360 | # Remove ./ or / prefix if present. 361 | path = path.lstrip('./').lstrip('/') 362 | 363 | # Convert to kebab case. 364 | new_path = '/' + path.lower().replace('_', '-') 365 | else: 366 | new_path = '' 367 | 368 | # Add anchor if present. 369 | if anchor: 370 | new_path = f"{new_path}{anchor}" 371 | 372 | # Format the link text if it's a technical name. 373 | if text.endswith('.md') or text.endswith('.json') or text.isupper() or '.json' in text: 374 | text = format_link_text(text) 375 | 376 | print(f' ⭮ {text} → {new_path}') 377 | return f'[{text}]({new_path})' 378 | 379 | # First pass: handle standard markdown links. 380 | pattern = r'\[([^\]]+)\]\(((?!http)[^)#\s]+)?([#][^)\s]+)?\)' 381 | content = re.sub(pattern, replace_link, content) 382 | 383 | # Second pass: handle already transformed links but with technical names. 384 | pattern = r'\[([A-Z_]+(?:\.(?:json|md))?)\](/[a-z-]+(?:[#][^)\s]+)?)\)' 385 | content = re.sub(pattern, lambda m: f'[{format_link_text(m.group(1))}]{m.group(2)}', content) 386 | 387 | return content 388 | 389 | 390 | def remove_img_tags(content: str) -> str: 391 | """Remove HTML img tags from the content.""" 392 | 393 | print('\n󰋼 Removing img tags...') 394 | 395 | def replace_img(match): 396 | img = match.group(0) 397 | print(f' ⭮ Removing img tag: {img[:120]}') 398 | return '' 399 | 400 | return re.sub( 401 | r']+>', 402 | replace_img, 403 | content 404 | ) 405 | 406 | 407 | def transform_inline_references(content: str) -> str: 408 | """Transform inline file references and URLs to proper format.""" 409 | 410 | print('\n󰋼 Transforming inline references...') 411 | 412 | def replace_reference(match): 413 | path = match.group(1) 414 | 415 | # Remove .md extension if present. 416 | path = path.replace('.md', '') 417 | 418 | # Convert to kebab case and add leading slash. 419 | new_path = '/' + path.lstrip('./').lstrip('/').lower().replace('_', '-') 420 | 421 | print(f' ⭮ {path} → {new_path}') 422 | 423 | return new_path 424 | 425 | # Transform file references like ./DATASET_SCHEMA.md to /dataset-schema. 426 | content = re.sub( 427 | r'(?<=See )\.?/?([A-Z_]+\.md)', 428 | replace_reference, 429 | content 430 | ) 431 | 432 | # Transform TODO references using a separate pattern. 433 | pattern = r"([A-Z_]+\.md)" 434 | content = re.sub( 435 | r'Move to ([A-Z_]+\.md)', 436 | lambda m: f'Move to {replace_reference(re.match(pattern, m.group(1)))}', 437 | content 438 | ) 439 | 440 | return content 441 | 442 | 443 | def transform_markdown_to_mdx(content: str, source_file: Path) -> str: 444 | """Main transformation pipeline to convert markdown to MDX format.""" 445 | 446 | print('\n󰋼 Parsing frontmatter...') 447 | post = frontmatter.loads(content) 448 | 449 | is_readme = source_file.name.lower() == 'readme.md' 450 | 451 | # Apply transformations in sequence. 
    transformed = remove_table_of_contents(post.content)
    transformed = transform_image_references(transformed)
    transformed = remove_picture_components(transformed)
    transformed = transform_astro_blocks(transformed)
    transformed = add_github_header(transformed, is_readme)
    transformed = remove_bold_formatting(transformed)
    transformed = transform_schema_links(transformed)
    transformed = transform_internal_links(transformed)
    transformed = transform_inline_references(transformed)
    transformed = remove_html_comments(transformed)
    transformed = remove_img_tags(transformed)

    print('\n󰋼 Combining with Astro imports...')
    return f'{ASTRO_IMPORTS}\n\n{transformed}'


def get_target_path(source_path: Path) -> Path:
    """Convert source path to target path using the required transformations."""

    # Get relative path from source root.
    rel_path = source_path.relative_to(SOURCE_ROOT)

    # Transform filename.
    stem = rel_path.stem.lower().replace('_', '-')
    new_name = f"{stem}.mdx"

    # Construct target path.
    if source_path.name == 'README.md':
        # Special case for README.md -> index.mdx.
        return TARGET_ROOT / 'src/content/pages/index.mdx'
    else:
        # For files in pages directory.
        return TARGET_ROOT / 'src/content/pages' / new_name


def process_files():
    """Main function to process all markdown files."""

    try:
        # Find all markdown files to process.
        source_files = [
            Path(p) for p in [
                *glob.glob(str(SOURCE_ROOT / '*.md')),       # root md files
                *glob.glob(str(SOURCE_ROOT / 'pages/*.md'))  # files in pages directory
            ]
            if should_process_file(Path(p))  # filter out ignored files
        ]

        print(f'\n󰋼 Found {len(source_files)} markdown files to process')

        for source_file in source_files:
            target_file = get_target_path(source_file)
            print(f'\n󰋼 Processing: {source_file.name} → {target_file.name}')

            # Read source content.
            print(f' Reading source file: {source_file}')
            with open(source_file, 'r', encoding='utf-8') as f:
                content = f.read()
            print(f' Source file size: {len(content)} bytes')

            # Transform content.
            print('\n󰋼 Transforming content...')
            transformed_content = transform_markdown_to_mdx(content, source_file)
            print(f' ⭮ {len(transformed_content)} bytes')

            # Write target file.
            print(f'\n󰋼 Writing target file: {target_file}')
            os.makedirs(target_file.parent, exist_ok=True)
            with open(target_file, 'w', encoding='utf-8') as f:
                f.write(transformed_content)
            print(f' ⭮ {source_file.name} → {target_file.name}')

        print('\n󰋼 Formatting MDX files...')
        os.system('npm run format-sync')

        print('\n Done')

    except Exception as error:
        print('\n❌ Error processing files:', str(error))
        sys.exit(1)


if __name__ == '__main__':
    process_files()

--------------------------------------------------------------------------------
/scripts/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

echo "󰋼 Setting up development environment..."

# Create virtual environment if it doesn't exist.
if [ ! -d ".venv" ]; then
    echo -e "\n󰋼 Creating Python virtual environment..."
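    # `python3 -m venv .venv` creates an isolated interpreter and pip under
    # .venv/, so the dependency installs below never touch the system Python.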
    python3 -m venv .venv
fi

# Activate virtual environment.
echo -e "\n󰋼 Activating virtual environment..."
source .venv/bin/activate || source .venv/Scripts/activate

# Install/upgrade pip and dependencies.
echo -e "\n󰋼 Installing Python dependencies..."
python3 -m pip install --upgrade pip
python3 -m pip install -r requirements.txt

# Make scripts executable.
echo -e "\n󰋼 Making scripts executable..."
chmod +x scripts/*.sh

echo -e "\n󰋼 Setup complete! Activate the virtual environment with:"
echo "source .venv/bin/activate  # Unix/Mac"
echo ".venv\\Scripts\\activate   # Windows"

--------------------------------------------------------------------------------
/scripts/test-sync.sh:
--------------------------------------------------------------------------------
#!/bin/bash

WORK_DIR="sync"

echo "󰋼 Starting sync test..."

rm -rf $WORK_DIR
mkdir -p $WORK_DIR
cd $WORK_DIR

if [ ! -d "source" ]; then
    mkdir source
    cp -r ../{pages,*.md} source/
    echo -e "\n Source files copied"
else
    echo -e "\n󰋼 Using existing source directory"
fi

echo -e "\n\n"

if [ ! -d "target" ]; then
    git clone https://github.com/apify/actor-whitepaper-web target
    echo -e "\n Target repository cloned"
else
    echo -e "\n󰋼 Using existing target directory"
fi

echo -e "\n\n"

cd target
git pull origin main
echo -e "\n Target repository updated"

echo -e "\n\n"

cd ../..

if [ ! -d ".venv" ]; then
    python3 -m venv .venv
    echo -e "\n Python virtual environment created"
fi

echo "Current path: $(pwd)"
source .venv/bin/activate

echo -e "\n\n"

python3 -m pip install --upgrade pip
python3 -m pip install -r ./requirements.txt
echo -e "\n Python dependencies installed"

echo -e "\n\n"
echo "Current path: $(pwd)"
python3 scripts/md2mdx.py --source $WORK_DIR/source --target $WORK_DIR/target
echo -e "\n MD to MDX conversion completed"

echo -e "\n\n"

# Note: the script is back at the repo root here, so the target clone lives
# under $WORK_DIR/target, not ./target.
cd $WORK_DIR/target
git status
echo -e "\n Target repository status checked"
cd ../..

echo -e "\n\n"

echo -e "\n Check changes in ${WORK_DIR}/target"

deactivate

echo -e "\n󰋼 Done"

--------------------------------------------------------------------------------
/sync-pr-template.txt:
--------------------------------------------------------------------------------
Automated sync of Whitepaper content to MDX format.

This PR was automatically generated by the sync workflow at https://github.com/apify/actor-whitepaper.

- [ ] Review content changes
- [ ] Check MDX formatting

--------------------------------------------------------------------------------