├── .editorconfig ├── .eslintrc.json ├── .gitattributes ├── .github ├── funding.yml └── workflows │ ├── ci.yaml │ └── legacy-proxy.yml ├── .gitignore ├── .npmrc ├── legacy-proxy ├── Dockerfile ├── nginx.conf └── readme.md ├── license ├── package.json ├── readme.md └── src ├── fetch.js ├── linked-data ├── .bin │ └── .gitignore └── zhv-de │ ├── index.sh │ └── mapping.ttl ├── run.sh └── scrapers ├── hvv.js ├── luxembourg.js └── opendata-oepnv.js /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = tab 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.{yml,yaml}] 12 | indent_style = space 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "standard", 3 | "rules": { 4 | "comma-dangle": [ 5 | "error", 6 | "always-multiline" 7 | ], 8 | "indent": ["error", "tab"], 9 | "no-tabs": "off" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | github: [juliuste] 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | pull_request: 5 | schedule: 6 | # keep request limits in mind before increasing the cron frequency 7 | # * is a special character in YAML so you have to quote this string 8 | - cron: '0 2 * * *' 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout main 14 | uses: actions/checkout@v3 15 | - uses: pnpm/action-setup@v2 16 | name: Set up pnpm 17 | with: 18 | version: 7 19 | - name: Set up Node.js 18 20 | uses: actions/setup-node@v3 21 | with: 22 | node-version: 18 23 | - name: Install dependencies 24 | run: pnpm install 25 | - name: Run tests 26 | run: pnpm test 27 | env: 28 | CI: true 29 | 30 | fetch-and-publish: 31 | runs-on: ubuntu-latest 32 | needs: test 33 | if: github.ref == 'refs/heads/main' 34 | steps: 35 | - name: Checkout main 36 | uses: actions/checkout@v3 37 | - name: Set up Python 3 38 | uses: actions/setup-python@v4 39 | with: 40 | python-version: 3 41 | - name: Install awscli 42 | run: pip3 install awscli awscli-plugin-endpoint 43 | 44 | - name: Set up awscli configuration 45 | env: 46 | DO_SPACES_KEY: ${{ secrets.DO_SPACES_KEY }} 47 | DO_SPACES_SECRET: ${{ secrets.DO_SPACES_SECRET }} 48 | run: | 49 | set -e; 50 | mkdir ~/.aws; 51 | echo " 52 | [default] 53 | aws_access_key_id=$DO_SPACES_KEY 54 | aws_secret_access_key=$DO_SPACES_SECRET 55 | " > ~/.aws/credentials; 56 | echo " 57 | [plugins] 58 | endpoint = awscli_plugin_endpoint 59 | [default] 60 | region = fr-par 61 | s3 = 62 | endpoint_url = https://fra1.digitaloceanspaces.com 63 | signature_version = s3v4 64 | max_concurrent_requests = 100 65 | max_queue_size = 2000 66 | multipart_threshold = 2000MB 67 | # Edit the multipart_chunksize value according to the file sizes that you want to upload. 
The present configuration allows to upload files up to 10 GB (100 requests * 10MB). For example setting it to 5GB allows you to upload files up to 5TB. 68 | multipart_chunksize = 2000MB 69 | s3api = 70 | endpoint_url = https://fra1.digitaloceanspaces.com 71 | " > ~/.aws/config; 72 | 73 | - name: Ensure that bucket versioning is enabled 74 | env: 75 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 76 | run: aws s3api put-bucket-versioning --bucket $DO_SPACES_BUCKET_NAME --versioning-configuration 'Status=Enabled' 77 | 78 | - uses: pnpm/action-setup@v2 79 | name: Set up pnpm 80 | with: 81 | version: 7 82 | - name: Set up Node.js 18 83 | uses: actions/setup-node@v3 84 | with: 85 | node-version: 18 86 | - name: Install dependencies 87 | run: pnpm install 88 | 89 | - name: "Fetch and upload feed: DE_GTFS" 90 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 91 | env: 92 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 93 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 94 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 95 | run: | 96 | set -e; 97 | npm run fetch-de-gtfs; 98 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/gtfs.zip || echo 'not-yet-existing'); 99 | newhash=$(cat de-gtfs.zip | md5sum); 100 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 101 | then 102 | aws s3 cp --acl public-read de-gtfs.zip s3://"$DO_SPACES_BUCKET_NAME"/de/gtfs.zip 103 | else 104 | echo 'file unchanged, skipping.' 105 | fi; 106 | 107 | - name: "Fetch and upload feed: DE_NETEX" 108 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 109 | env: 110 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 111 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 112 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 113 | run: | 114 | set -e; 115 | npm run fetch-de-netex; 116 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/netex.zip || echo 'not-yet-existing'); 117 | newhash=$(cat de-netex.zip | md5sum); 118 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 119 | then 120 | aws s3 cp --acl public-read de-netex.zip s3://"$DO_SPACES_BUCKET_NAME"/de/netex.zip 121 | else 122 | echo 'file unchanged, skipping.' 123 | fi; 124 | 125 | - name: "Fetch and upload feed: DE_ZHV" 126 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 127 | env: 128 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 129 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 130 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 131 | run: | 132 | set -e; 133 | npm run fetch-de-zhv; 134 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/zhv.zip || echo 'not-yet-existing'); 135 | newhash=$(cat de-zhv.zip | md5sum); 136 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 137 | then 138 | aws s3 cp --acl public-read de-zhv.zip s3://"$DO_SPACES_BUCKET_NAME"/de/zhv.zip 139 | else 140 | echo 'file unchanged, skipping.' 
141 | fi; 142 | 143 | - name: "Generate linked data: DE_ZHV" 144 | # todo: do not run if processing the dataset failed 145 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 146 | env: 147 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 148 | run: | 149 | set -e; 150 | ./src/linked-data/zhv-de/index.sh 151 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/zhv.ttl.gz || echo 'not-yet-existing'); 152 | newhash=$(cat ./src/linked-data/zhv-de/data/output.ttl.gz | md5sum); 153 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 154 | then 155 | aws s3 cp --acl public-read ./src/linked-data/zhv-de/data/output.ttl.gz s3://"$DO_SPACES_BUCKET_NAME"/de/zhv.ttl.gz 156 | else 157 | echo 'file unchanged, skipping.' 158 | fi; 159 | 160 | - name: "Fetch and upload feed: DE_NRW_GTFS" 161 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 162 | env: 163 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 164 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 165 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 166 | run: | 167 | set -e; 168 | npm run fetch-de-nrw-gtfs; 169 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/nrw-gtfs.zip || echo 'not-yet-existing'); 170 | newhash=$(cat de-nrw-gtfs.zip | md5sum); 171 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 172 | then 173 | aws s3 cp --acl public-read de-nrw-gtfs.zip s3://"$DO_SPACES_BUCKET_NAME"/de/nrw-gtfs.zip 174 | else 175 | echo 'file unchanged, skipping.' 176 | fi; 177 | 178 | - name: "Fetch and upload feed: DE_HVV_GTFS" 179 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 180 | env: 181 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 182 | run: | 183 | set -e; 184 | npm run fetch-de-hvv-gtfs; 185 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key de/hvv-gtfs.zip || echo 'not-yet-existing'); 186 | newhash=$(cat de-hvv-gtfs.zip | md5sum); 187 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 188 | then 189 | aws s3 cp --acl public-read de-hvv-gtfs.zip s3://"$DO_SPACES_BUCKET_NAME"/de/hvv-gtfs.zip 190 | else 191 | echo 'file unchanged, skipping.' 192 | fi; 193 | 194 | - name: "Fetch and upload feed: LU_GTFS" 195 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 196 | env: 197 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 198 | run: | 199 | set -e; 200 | npm run fetch-lu-gtfs; 201 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key lu/gtfs.zip || echo 'not-yet-existing'); 202 | newhash=$(cat lu-gtfs.zip | md5sum); 203 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 204 | then 205 | aws s3 cp --acl public-read lu-gtfs.zip s3://"$DO_SPACES_BUCKET_NAME"/lu/gtfs.zip 206 | else 207 | echo 'file unchanged, skipping.' 
208 | fi; 209 | 210 | - name: "Fetch and upload feed: SE_GTFS" 211 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 212 | env: 213 | TRAFIKLAB_API_KEY: ${{ secrets.TRAFIKLAB_API_KEY }} 214 | DO_SPACES_BUCKET_NAME: ${{ secrets.DO_SPACES_BUCKET_NAME }} 215 | run: | 216 | set -e; 217 | npm run fetch-se-gtfs; 218 | currentobj=$(aws s3api head-object --bucket $DO_SPACES_BUCKET_NAME --key se/gtfs.zip || echo 'not-yet-existing'); 219 | newhash=$(cat se-gtfs.zip | md5sum); 220 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 221 | then 222 | aws s3 cp --acl public-read se-gtfs.zip s3://"$DO_SPACES_BUCKET_NAME"/se/gtfs.zip 223 | else 224 | echo 'file unchanged, skipping.' 225 | fi; 226 | -------------------------------------------------------------------------------- /.github/workflows/legacy-proxy.yml: -------------------------------------------------------------------------------- 1 | name: Legacy proxy 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | build-and-push: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up QEMU 12 | uses: docker/setup-qemu-action@v3 13 | - name: Set up Docker Buildx 14 | uses: docker/setup-buildx-action@v3 15 | - name: Login to GitHub Container Registry 16 | uses: docker/login-action@v3 17 | with: 18 | registry: ghcr.io 19 | username: ${{ github.repository_owner }} 20 | password: ${{ secrets.GITHUB_TOKEN }} 21 | - name: Fetch commit hash 22 | id: hash 23 | run: echo "::set-output name=hash::$(echo $GITHUB_SHA | head -c7)" 24 | - name: Fetch current date and time 25 | id: datetime 26 | run: echo "::set-output name=datetime::$(date -u +'%Y-%m-%dT%H.%M.%SZ')" 27 | - name: Build and push 28 | uses: docker/build-push-action@v3 29 | with: 30 | file: ./legacy-proxy/Dockerfile 31 | platforms: linux/amd64,linux/arm64 32 | push: true 33 | tags: ghcr.io/${{github.repository}}-legacy-proxy:v1_${{steps.hash.outputs.hash}}_${{steps.datetime.outputs.datetime}} 34 | cache-from: type=gha 35 | cache-to: type=gha,mode=max 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # general 2 | .DS_Store 3 | *.log 4 | 5 | # node-specific 6 | node_modules 7 | package-lock.json 8 | yarn.lock 9 | shrinkwrap.yaml 10 | pnpm-lock.yaml 11 | dist 12 | 13 | data 14 | *.zip 15 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /legacy-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:alpine 2 | 3 | WORKDIR /app 4 | 5 | COPY legacy-proxy/nginx.conf /etc/nginx/conf.d/default.conf 6 | -------------------------------------------------------------------------------- /legacy-proxy/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 3000; 3 | rewrite /gtfs-germany.zip https://scraped.data.public-transport.earth/de/gtfs.zip permanent; 4 | rewrite /netex-germany.zip https://scraped.data.public-transport.earth/de/netex.zip permanent; 5 | rewrite /zhv.zip https://scraped.data.public-transport.earth/de/zhv.zip permanent; 6 | } 7 | 
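For a quick local sanity check of the proxy configuration above, one option (a sketch, not part of the repository; the image tag legacy-proxy-test is an arbitrary placeholder) is to build the image from the repository root, since the Dockerfile copies legacy-proxy/nginx.conf relative to that build context, and then request one of the legacy paths. The expected response is a permanent (301) redirect whose Location header points at the corresponding scraped.data.public-transport.earth URL:

# build and run from the repository root (legacy-proxy-test is a placeholder tag)
docker build -f legacy-proxy/Dockerfile -t legacy-proxy-test .
docker run --rm -p 3000:3000 legacy-proxy-test
# in a second shell; expect "301 Moved Permanently" and a Location header
curl -sI http://localhost:3000/gtfs-germany.zip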
--------------------------------------------------------------------------------
/legacy-proxy/readme.md:
--------------------------------------------------------------------------------
 1 | # Legacy proxy
 2 | 
 3 | The Docker image defined in this directory is deployed to our [shared infrastructure](https://github.com/public-transport/infrastructure) and exposes a legacy proxy which forwards all requests for old dataset URLs to their latest equivalents.
 4 | 
 5 | This service exists only for backwards compatibility and might be shut down in the future; users should always use the latest endpoints in their projects.
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
 1 | # Data
 2 | 
 3 | Data is provided under separate licenses; refer to readme.md for a full list.
 4 | 
 5 | # Code
 6 | 
 7 | Copyright (c) 2023, Julius Tens
 8 | 
 9 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
12 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"name": "public-transport-data-scraper",
 3 | 	"version": "0.0.0",
 4 | 	"private": true,
 5 | 	"description": "Scraper that re-publishes official public transport datasets under stable URLs.",
 6 | 	"homepage": "https://github.com/juliuste/public-transport-data-scraper",
 7 | 	"bugs": "https://github.com/juliuste/public-transport-data-scraper/issues",
 8 | 	"repository": "juliuste/public-transport-data-scraper",
 9 | 	"license": "ISC",
10 | 	"author": "Julius Tens ",
11 | 	"type": "module",
12 | 	"scripts": {
13 | 		"check-deps": "depcheck",
14 | 		"fix": "npm run lint -- --fix",
15 | 		"lint": "eslint src",
16 | 		"fetch-de-gtfs": "DATASET=DE_GTFS FILE_NAME=de-gtfs.zip MINIMUM_SIZE_MB=100 ./src/run.sh",
17 | 		"fetch-de-hvv-gtfs": "DATASET=DE_HVV_GTFS FILE_NAME=de-hvv-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh",
18 | 		"fetch-de-netex": "DATASET=DE_NETEX FILE_NAME=de-netex.zip MINIMUM_SIZE_MB=500 ./src/run.sh",
19 | 		"fetch-de-nrw-gtfs": "DATASET=DE_NRW_GTFS FILE_NAME=de-nrw-gtfs.zip MINIMUM_SIZE_MB=30 ./src/run.sh",
20 | 		"fetch-de-zhv": "DATASET=DE_ZHV FILE_NAME=de-zhv.zip MINIMUM_SIZE_MB=10 ./src/run.sh",
21 | 		"fetch-lu-gtfs": "DATASET=LU_GTFS FILE_NAME=lu-gtfs.zip MINIMUM_SIZE_MB=3 ./src/run.sh",
22 | 		"fetch-se-gtfs": "DATASET=SE_GTFS FILE_NAME=se-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh",
23 | 		"test": "npm run lint && npm run check-deps"
24 | 	},
25 | 	"dependencies": {
26 | 		"cheerio": "1.0.0-rc.12",
27 | 		"got": "^14.4.5",
28 | 		"lodash": "^4.17.21",
29 | 		"luxon": "^3.5.0"
30 | 	},
31 | 	"devDependencies": {
32 | 		"depcheck": "^1.4.7",
33 | 		"eslint": "^8.57.1",
34 | 		"eslint-config-standard": "^17.1.0",
35 | 		"eslint-plugin-import": "^2.31.0",
36 | 		"eslint-plugin-n": "^16.6.2",
37 | 		"eslint-plugin-promise": "^6.6.0"
38 | 	},
39 | 	"engines": {
40 | 		"node": ">=18"
41 | 	}
42 | }
43 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # public-transport-data-scraper
 2 | 
 3 | Scraper that re-publishes official public transport datasets under stable URLs, since many transportation authorities and government agencies sadly don't provide stable URLs themselves, making it nearly impossible to integrate these datasets into automated systems.
 4 | 
 5 | ## Scraped datasets
 6 | 
 7 | Dataset | License | Attribution | Stable URL
 8 | ------- | ------- | ----------- | ----------
 9 | [Germany-wide GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten-gtfs&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.de) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/gtfs.zip)
10 | [Germany-wide NETEX feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.de) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/netex.zip)
11 | [German public transport stop registry (ZHV)](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-haltestellendaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.de) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/zhv.zip)
12 | [North Rhine-Westphalia (NRW) GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/bundeslaender/nrw/startseite?tx_vrrkit_view[dataset_name]=soll-fahrplandaten-nrw&tx_vrrkit_view[action]=details&tx_vrrkit_view[controller]=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.de) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/nrw-gtfs.zip)
13 | [Hamburger Verkehrsverbund (HVV) GTFS feed](https://suche.transparenz.hamburg.de/dataset?q=hvv%20gtfs&sort=score+desc%2Ctitle_sort+asc&esq_not_all_versions=true) (🇩🇪) | [DL-DE BY 2.0](https://www.govdata.de/dl-de/by-2-0) | [Hamburger Verkehrsverbund GmbH](https://www.hvv.de/) | [latest](https://scraped.data.public-transport.earth/de/hvv-gtfs.zip)
14 | [Luxembourg-wide GTFS feed](https://data.public.lu/en/datasets/horaires-et-arrets-des-transport-publics-gtfs/) (🇱🇺) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.de) | [Administration des transports publics](https://mmtp.gouvernement.lu/de/annuaire.html?idMin=7854) | [latest](https://scraped.data.public-transport.earth/lu/gtfs.zip)
15 | [Sweden-wide GTFS feed](https://www.trafiklab.se/api/trafiklab-apis/gtfs-sverige-2/) (🇸🇪) | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/deed.en) | [Trafiklab](https://www.trafiklab.se/) | [latest](https://scraped.data.public-transport.earth/se/gtfs.zip)
16 | 
17 | ## Contributing
18 | 
19 | If you find a bug, want to propose a feed or add a new scraper, feel free to 
visit [the issues page](https://github.com/juliuste/public-transport-data-scraper/issues), or open a pull request. 20 | -------------------------------------------------------------------------------- /src/fetch.js: -------------------------------------------------------------------------------- 1 | const dataset = process.env.DATASET 2 | 3 | const main = async () => { 4 | if (['DE_NETEX', 'DE_GTFS', 'DE_ZHV', 'DE_NRW_GTFS'].includes(dataset)) { 5 | const [user, password] = [process.env.OPENDATA_OEPNV_EMAIL, process.env.OPENDATA_OEPNV_PASSWORD] 6 | if (typeof user !== 'string' || user.length === 0) throw new Error('env.OPENDATA_OEPNV_EMAIL must be a non-empty string') 7 | if (typeof password !== 'string' || password.length === 0) throw new Error('env.OPENDATA_OEPNV_PASSWORD must be a non-empty string') 8 | 9 | if (dataset === 'DE_NETEX') { 10 | const { netex } = await import('./scrapers/opendata-oepnv.js') 11 | await netex(user, password) 12 | return 13 | } 14 | if (dataset === 'DE_GTFS') { 15 | const { gtfs } = await import('./scrapers/opendata-oepnv.js') 16 | await gtfs(user, password) 17 | return 18 | } 19 | if (dataset === 'DE_ZHV') { 20 | const { zhv } = await import('./scrapers/opendata-oepnv.js') 21 | await zhv(user, password) 22 | return 23 | } 24 | if (dataset === 'DE_NRW_GTFS') { 25 | const { nrwGtfs } = await import('./scrapers/opendata-oepnv.js') 26 | await nrwGtfs(user, password) 27 | return 28 | } 29 | } 30 | if (dataset === 'DE_HVV_GTFS') { 31 | const { hvvGtfs } = await import('./scrapers/hvv.js') 32 | await hvvGtfs() 33 | return 34 | } 35 | if (dataset === 'LU_GTFS') { 36 | const { luxembourgGtfs } = await import('./scrapers/luxembourg.js') 37 | await luxembourgGtfs() 38 | return 39 | } 40 | throw new Error(`unknown dataset: ${dataset}`) 41 | } 42 | 43 | main() 44 | .catch(error => { 45 | console.error(error) 46 | process.exit(1) 47 | }) 48 | -------------------------------------------------------------------------------- /src/linked-data/.bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | DIR=$(dirname "$0") 4 | BASE_IRI="https://lod.codefor.de/" 5 | BIN_DIR="$DIR/../.bin" 6 | DATA_DIR="$DIR/data" 7 | 8 | echo 'Fetching rmlmapper' 9 | wget -q --show-progress --progress=dot:mega -c -N -O "$BIN_DIR/rmlmapper.jar" https://github.com/RMLio/rmlmapper-java/releases/download/v7.0.0/rmlmapper-7.0.0-r374-all.jar 10 | 11 | rm -rf "$DATA_DIR" 12 | mkdir -p "$DATA_DIR" 13 | 14 | echo 'Preparing data…' 15 | cp "$DIR/../../../de-zhv.zip" "$DATA_DIR/source.zip" 16 | unzip "$DATA_DIR/source.zip" -d "$DATA_DIR/unzipped" 17 | find "$DATA_DIR/unzipped/" -name "*.csv" -exec mv '{}' "$DATA_DIR/source.csv" \; 18 | 19 | pnpx csvtojson --delimiter=";" "$DATA_DIR/source.csv" > "$DATA_DIR/source.json" 20 | 21 | echo 'Applying mapping…' 22 | java -jar "$BIN_DIR/rmlmapper.jar" -m "$DIR/mapping.ttl" -s turtle --strict --base-iri "$BASE_IRI" > "$DATA_DIR/output.ttl" 23 | 24 | echo 'Compressing output…' 25 | cat "$DATA_DIR/output.ttl" | gzip > "$DATA_DIR/output.ttl.gz" 26 | 27 | # echo 'Done.' 
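# Example local invocation (a sketch; assumes Java and pnpm are available and that de-zhv.zip has
# already been fetched into the repository root, e.g. via `npm run fetch-de-zhv`):
#   ./src/linked-data/zhv-de/index.sh
#   zcat ./src/linked-data/zhv-de/data/output.ttl.gz | head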
28 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/mapping.ttl: -------------------------------------------------------------------------------- 1 | @prefix bahnhof: . 2 | @prefix codeforde: . 3 | @prefix dct: . 4 | @prefix dbo: . 5 | @prefix foaf: . 6 | @prefix geo: . 7 | @prefix gn: . 8 | @prefix juso: . 9 | @prefix netex: . 10 | @prefix owl: . 11 | @prefix ql: . 12 | @prefix rdf: . 13 | @prefix rdfs: . 14 | @prefix rml: . 15 | @prefix rr: . 16 | @prefix schema: . 17 | @prefix status: . 18 | @prefix wdt: . 19 | @prefix xsd: . 20 | 21 | # level 1 (stop place) 22 | 23 | [ a rr:TriplesMap ] 24 | rml:logicalSource [ 25 | rml:source "data/source.json"; 26 | rml:referenceFormulation ql:JSONPath; 27 | rml:iterator "$[?(@.Type == 'S')]" 28 | ]; 29 | rr:subjectMap [ 30 | rr:template "resource/by-key/ifopt/{.DHID}"; 31 | rr:class netex:StopPlace 32 | ]; 33 | rr:predicateObjectMap [ 34 | rr:predicate netex:topographicPlace; 35 | rr:objectMap [ 36 | rr:termType rr:IRI; 37 | rr:template "resource/by-key/ags/{.MunicipalityCode}" 38 | ] 39 | ], [ 40 | rr:predicate bahnhof:ifoptStopId; 41 | rr:objectMap [ 42 | rr:datatype xsd:string; 43 | rr:termType rr:Literal; 44 | rml:reference "$.DHID" 45 | ] 46 | ], [ 47 | rr:predicate netex:name; 48 | rr:objectMap [ 49 | rr:datatype xsd:string; 50 | rr:termType rr:Literal; 51 | rml:reference "$.Name" 52 | ] 53 | ]. 54 | # todo: other attributes 55 | 56 | # todo: level 2 (quay - platform), level 3 (quay - platform edge) 57 | -------------------------------------------------------------------------------- /src/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ -z ${FILE_NAME+x} ]; then echo "missing env.FILE_NAME"; exit 1; fi 5 | if [ -z ${MINIMUM_SIZE_MB+x} ]; then echo "missing env.MINIMUM_SIZE_MB"; exit 1; fi 6 | 7 | DIRECTORY=$(dirname "$0") 8 | FILE_PATH="$DIRECTORY/../$FILE_NAME" 9 | MINIMUM_SIZE_BYTES=$((1024 * 1024 * $MINIMUM_SIZE_MB)) 10 | 11 | if [[ $DATASET == "SE_GTFS" ]]; then 12 | curl --output $FILE_PATH -L "https://api.resrobot.se/gtfs/sweden.zip?key=$TRAFIKLAB_API_KEY" 13 | else 14 | node $DIRECTORY/fetch.js > $FILE_PATH; 15 | fi 16 | 17 | if [ $(wc -c $FILE_PATH | awk '{print $1}') -lt $MINIMUM_SIZE_BYTES ]; then (echo 'Unexpected file size, seems to small.'; exit 1;) fi 18 | -------------------------------------------------------------------------------- /src/scrapers/hvv.js: -------------------------------------------------------------------------------- 1 | import got from 'got' 2 | 3 | export const hvvGtfs = async () => { 4 | const response = await got.get(new URL('https://suche.transparenz.hamburg.de/api/3/action/package_search?q=name%3Ahvv-fahrplandaten-gtfs-%2A-bis%2A&sort=metadata_modified%20desc')).json() 5 | const item = response?.result?.results[0] 6 | if (!item || item.author !== 'Hamburger Verkehrsverbund GmbH') throw new Error('no matching dataset found') 7 | if (item.license_id !== 'dl-de-by-2.0') throw new Error('unexpected license') 8 | 9 | if (item.resources.length !== 1) throw new Error('unexpected number of resources') 10 | const { url, created } = item.resources[0] 11 | if (!url || !created) throw new Error('missing resource properties') 12 | 13 | // throw if latest file is older than 31 days 14 | if (+new Date() - (+new Date(created)) > 31 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${created}`) 15 | 16 | const stream = await got.stream.get(url, { 17 | 
https: { rejectUnauthorized: false }, // sigh… 18 | }) 19 | return stream.pipe(process.stdout) 20 | } 21 | -------------------------------------------------------------------------------- /src/scrapers/luxembourg.js: -------------------------------------------------------------------------------- 1 | import got from 'got' 2 | import lodash from 'lodash' 3 | 4 | export const luxembourgGtfs = async () => { 5 | const response = await got.get(new URL('https://data.public.lu/api/1/datasets/gtfs')).json() 6 | if (response?.license !== 'cc-by') throw new Error('unexpected license') 7 | 8 | const [latest] = lodash.sortBy((response?.resources || []).filter(r => r?.format === 'zip'), r => -new Date(r?.published)) 9 | if (!latest) throw new Error('no matching dataset found') 10 | 11 | const { published, url } = latest 12 | if (!url || !published) throw new Error('missing resource properties') 13 | 14 | // throw if latest file is older than 20 days 15 | if (+new Date() - (+new Date(published)) > 20 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${published}`) 16 | 17 | const stream = await got.stream.get(url) 18 | return stream.pipe(process.stdout) 19 | } 20 | -------------------------------------------------------------------------------- /src/scrapers/opendata-oepnv.js: -------------------------------------------------------------------------------- 1 | // the data is fetched and processed in these steps: 2 | // 1. obtain a session cookie and the zip file's url 3 | // 2. download the data using the session cookie 4 | 5 | import { DateTime } from 'luxon' 6 | import cheerio from 'cheerio' 7 | import got from 'got' 8 | import lodash from 'lodash' 9 | import { basename } from 'path' 10 | 11 | const throwOnMultipleOrNoMatches = fileUrls => { 12 | if (fileUrls.length !== 1) throw new Error(`unexpected number of file urls: ${fileUrls.length}, probably internal error or invalid credentials.`) 13 | return fileUrls[0] 14 | } 15 | 16 | const extractUrlFromResponse = (html, isMatchingFile, selectBestMatch) => { 17 | const parsed = cheerio.load(html) 18 | const urls = Array.from(parsed('a').filter(function (i, el) { 19 | // this === el 20 | return isMatchingFile(parsed(this).attr('href') || '') 21 | })).map(a => parsed(a).attr('href')) 22 | const url = selectBestMatch(urls) 23 | return url 24 | } 25 | 26 | const fetchCookie = async (user, password) => { 27 | const url = new URL('https://www.opendata-oepnv.de/ht/de/willkommen') 28 | url.searchParams.append('tx_felogin_login[action]', 'login') 29 | url.searchParams.append('tx_felogin_login[controller]', 'Login') 30 | url.searchParams.append('cHash', '99c35a06ebc0db4f37f0bb93048bb79b') 31 | const response = await got.post(url, { 32 | form: { 33 | user, 34 | pass: password, 35 | submit: 'Anmelden', 36 | logintype: 'login', 37 | pid: '174@d6f42d5376399b9d6eee5cbcb5a06dcb1b489387', 38 | }, 39 | }) 40 | const cookie = (response.headers['set-cookie'] || []).find(c => c.includes('fe_typo_user')) 41 | if (!cookie) throw new Error('cookie not found. 
internal error or invalid credentials') 42 | return cookie 43 | } 44 | 45 | const fetchAndOutput = async (user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch = throwOnMultipleOrNoMatches) => { 46 | const cookie = await fetchCookie(user, password) 47 | 48 | const url = new URL(organisationPath, 'https://www.opendata-oepnv.de/ht/de/organisation/') 49 | url.searchParams.append('tx_vrrkit_view[dataset_name]', datasetName) 50 | url.searchParams.append('tx_vrrkit_view[action]', 'details') 51 | url.searchParams.append('tx_vrrkit_view[controller]', 'View') 52 | 53 | const response = await got.get(url, { headers: { Cookie: cookie } }) 54 | const fileUrl = extractUrlFromResponse(response.body, isMatchingFile, selectBestMatch) 55 | 56 | const stream = await got.stream.get(fileUrl, { headers: { Cookie: cookie } }) 57 | return stream.pipe(process.stdout) 58 | } 59 | 60 | export const gtfs = async (user, password) => { 61 | const organisationPath = 'delfi/startseite' 62 | const datasetName = 'deutschlandweite-sollfahrplandaten-gtfs' 63 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland_gtfs.zip') 64 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 65 | } 66 | 67 | export const netex = async (user, password) => { 68 | const organisationPath = 'delfi/startseite' 69 | const datasetName = 'deutschlandweite-sollfahrplandaten' 70 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland.zip') 71 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 72 | } 73 | 74 | export const zhv = async (user, password) => { 75 | const organisationPath = 'delfi/startseite' 76 | const datasetName = 'deutschlandweite-haltestellendaten' 77 | const isMatchingFile = name => name.endsWith('_zHV_gesamt.zip') 78 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 79 | } 80 | 81 | export const nrwGtfs = async (user, password) => { 82 | const organisationPath = 'bundeslaender/nrw/startseite' 83 | const datasetName = 'soll-fahrplandaten-nrw' 84 | const isMatchingFile = name => /\/\d{8}_gtfs_nrw\.zip$/.test(name) 85 | const selectBestMatch = urls => { 86 | const urlsWithDate = urls.map(url => { 87 | const fileName = basename(new URL(url).pathname) 88 | const date = DateTime.fromFormat(fileName.slice(0, 8), 'yyyyMMdd').toJSDate() 89 | return { date, url } 90 | }) 91 | const latest = lodash.last(lodash.sortBy(urlsWithDate, ({ date }) => +date)) 92 | // throw if latest file is older than 90 days 93 | if (+new Date() - (+latest.date) > 90 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${latest.date}`) 94 | return latest.url 95 | } 96 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch) 97 | } 98 | --------------------------------------------------------------------------------
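The feeds can also be fetched locally via the npm scripts defined in package.json, mirroring what the CI workflow does before uploading. A minimal sketch using the Luxembourg feed, which needs no credentials (the DE_* datasets additionally require OPENDATA_OEPNV_EMAIL and OPENDATA_OEPNV_PASSWORD, the Swedish feed a TRAFIKLAB_API_KEY):

# writes lu-gtfs.zip into the repository root (see src/run.sh)
npm run fetch-lu-gtfs
ls -lh lu-gtfs.zip
# the CI workflow compares this checksum against the ETag of the previously uploaded object and skips unchanged files
md5sum lu-gtfs.zip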