├── .npmrc ├── .gitattributes ├── .github ├── funding.yml └── workflows │ ├── legacy-proxy.yml │ └── ci.yaml ├── src ├── linked-data │ ├── .bin │ │ └── .gitignore │ └── zhv-de │ │ ├── convert.js │ │ ├── index.sh │ │ └── mapping.ttl ├── run.sh ├── scrapers │ ├── luxembourg.js │ └── opendata-oepnv.js └── fetch.js ├── legacy-proxy ├── Dockerfile ├── nginx.conf └── readme.md ├── .gitignore ├── .eslintrc.json ├── .editorconfig ├── license ├── package.json └── readme.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | github: [juliuste] 2 | -------------------------------------------------------------------------------- /src/linked-data/.bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /legacy-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:alpine 2 | 3 | WORKDIR /app 4 | 5 | COPY legacy-proxy/nginx.conf /etc/nginx/conf.d/default.conf 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # general 2 | .DS_Store 3 | *.log 4 | 5 | # node-specific 6 | node_modules 7 | package-lock.json 8 | yarn.lock 9 | shrinkwrap.yaml 10 | pnpm-lock.yaml 11 | dist 12 | 13 | data 14 | *.zip 15 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "standard", 3 | "rules": { 4 | "comma-dangle": [ 5 | "error", 6 | "always-multiline" 7 | ], 8 | "indent": ["error", "tab"], 9 | "no-tabs": "off" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = tab 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.{yml,yaml}] 12 | indent_style = space 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /legacy-proxy/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 3000; 3 | rewrite /gtfs-germany.zip https://scraped.data.public-transport.earth/de/gtfs.zip permanent; 4 | rewrite /netex-germany.zip https://scraped.data.public-transport.earth/de/netex.zip permanent; 5 | rewrite /zhv.zip https://scraped.data.public-transport.earth/de/zhv.zip permanent; 6 | } 7 | -------------------------------------------------------------------------------- /legacy-proxy/readme.md: -------------------------------------------------------------------------------- 1 | # Legacy proxy 2 | 3 | The docker image defined in this directory is deployed to our [shared 
infrastructure](https://github.com/public-transport/infrastructure) and exposes a legacy proxy which redirects all requests for old dataset URLs to their latest equivalents. 4 | 5 | This service only exists for backwards compatibility and might be shut down in the future; users should always use the latest endpoints in their projects. 6 | -------------------------------------------------------------------------------- /src/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ -z ${FILE_NAME+x} ]; then echo "missing env.FILE_NAME"; exit 1; fi 5 | if [ -z ${MINIMUM_SIZE_MB+x} ]; then echo "missing env.MINIMUM_SIZE_MB"; exit 1; fi 6 | 7 | DIRECTORY=$(dirname "$0") 8 | FILE_PATH="$DIRECTORY/../$FILE_NAME" 9 | MINIMUM_SIZE_BYTES=$((1024 * 1024 * $MINIMUM_SIZE_MB)) 10 | 11 | if [[ $DATASET == "SE_GTFS" ]]; then 12 | curl --output "$FILE_PATH" -L "https://api.resrobot.se/gtfs/sweden.zip?key=$TRAFIKLAB_API_KEY" 13 | else 14 | node "$DIRECTORY/fetch.js" > "$FILE_PATH" 15 | fi 16 | 17 | if [ $(wc -c "$FILE_PATH" | awk '{print $1}') -lt $MINIMUM_SIZE_BYTES ]; then echo 'Unexpected file size, seems too small.'; exit 1; fi 18 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/convert.js: -------------------------------------------------------------------------------- 1 | import { resolve } from 'path' 2 | import { loadJsonFile } from 'load-json-file' 3 | 4 | const dirname = import.meta.dirname 5 | 6 | const source = await loadJsonFile(resolve(dirname, './data/source.json')) 7 | const ags2ars = await loadJsonFile(resolve(dirname, './data/ags2ars.json')) 8 | 9 | const output = source.map(s => { 10 | const { MunicipalityCode } = s 11 | if (!MunicipalityCode) return s 12 | if (+MunicipalityCode === 0) return s 13 | const matchingArs = ags2ars[MunicipalityCode] 14 | if (!matchingArs) { 15 | console.error(`no ars found for municipality code: ${MunicipalityCode}`) 16 | return s 17 | } 18 | return { 19 | ...s, 20 | arsPadded: matchingArs.padEnd(12, '0'), 21 | } 22 | }) 23 | 24 | process.stdout.write(JSON.stringify(output, null, 2)) 25 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | Data is provided under separate licenses; refer to readme.md for a full list. 4 | 5 | # Code 6 | 7 | Copyright (c) 2023, Julius Tens 8 | 9 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
12 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | DIR=$(dirname "$0") 4 | BASE_IRI="https://lod.codefor.de/" 5 | BIN_DIR="$DIR/../.bin" 6 | DATA_DIR="$DIR/data" 7 | 8 | echo 'Fetching rmlmapper' 9 | wget -q --show-progress --progress=dot:mega -c -N -O "$BIN_DIR/rmlmapper.jar" https://github.com/RMLio/rmlmapper-java/releases/download/v7.0.0/rmlmapper-7.0.0-r374-all.jar 10 | 11 | rm -rf "$DATA_DIR" 12 | mkdir -p "$DATA_DIR" 13 | 14 | echo 'Fetching AGS mapping…' 15 | curl 'https://scraped.data.juliustens.eu/vg250-ew/ags2ars.json.gz' \ 16 | | gunzip > "$DATA_DIR/ags2ars.json" 17 | 18 | echo 'Preparing data…' 19 | cp "$DIR/../../../de-zhv.zip" "$DATA_DIR/source.zip" 20 | unzip "$DATA_DIR/source.zip" -d "$DATA_DIR/unzipped" 21 | find "$DATA_DIR/unzipped/" -name "*.csv" -exec mv '{}' "$DATA_DIR/source.csv" \; 22 | 23 | pnpx csvtojson --delimiter=";" "$DATA_DIR/source.csv" > "$DATA_DIR/source.json" 24 | 25 | echo 'Adding ARS keys…' 26 | node "$DIR/convert.js" > "$DATA_DIR/source-with-ars.json" 27 | 28 | echo 'Applying mapping…' 29 | java -jar "$BIN_DIR/rmlmapper.jar" -m "$DIR/mapping.ttl" -s turtle --strict --base-iri "$BASE_IRI" > "$DATA_DIR/output.ttl" 30 | 31 | echo 'Compressing output…' 32 | cat "$DATA_DIR/output.ttl" | gzip > "$DATA_DIR/output.ttl.gz" 33 | 34 | # echo 'Done.' 35 | -------------------------------------------------------------------------------- /.github/workflows/legacy-proxy.yml: -------------------------------------------------------------------------------- 1 | name: Legacy proxy 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | build-and-push: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up QEMU 12 | uses: docker/setup-qemu-action@v3 13 | - name: Set up Docker Buildx 14 | uses: docker/setup-buildx-action@v3 15 | - name: Login to GitHub Container Registry 16 | uses: docker/login-action@v3 17 | with: 18 | registry: ghcr.io 19 | username: ${{ github.repository_owner }} 20 | password: ${{ secrets.GITHUB_TOKEN }} 21 | - name: Fetch commit hash 22 | id: hash 23 | run: echo "hash=$(echo $GITHUB_SHA | head -c7)" >> "$GITHUB_OUTPUT" 24 | - name: Fetch current date and time 25 | id: datetime 26 | run: echo "datetime=$(date -u +'%Y-%m-%dT%H.%M.%SZ')" >> "$GITHUB_OUTPUT" 27 | - name: Build and push 28 | uses: docker/build-push-action@v3 29 | with: 30 | file: ./legacy-proxy/Dockerfile 31 | platforms: linux/amd64,linux/arm64 32 | push: true 33 | tags: ghcr.io/${{github.repository}}-legacy-proxy:v1_${{steps.hash.outputs.hash}}_${{steps.datetime.outputs.datetime}} 34 | cache-from: type=gha 35 | cache-to: type=gha,mode=max 36 | -------------------------------------------------------------------------------- /src/scrapers/luxembourg.js: -------------------------------------------------------------------------------- 1 | import got from 'got' 2 | import lodash from 'lodash' 3 | 4 | const findLatestFeed = (response, license) => { 5 | if (response?.license !== license) throw new Error('unexpected license') 6 | const [latest] = lodash.sortBy((response?.resources || []).filter(r => r?.format === 'zip'), r => -new Date(r?.created_at)) 7 | if (!latest) throw new Error('no matching dataset found') 8 | 9 | const { created_at: createdAt, url } = latest 10 | if (!url || !createdAt) throw new Error('missing resource properties') 11 | 12 | // throw if latest file is
older than 20 days 13 | if (+new Date() - (+new Date(createdAt)) > 20 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${createdAt}`) 14 | 15 | return url 16 | } 17 | 18 | export const luxembourgGtfs = async () => { 19 | const response = await got.get(new URL('https://data.public.lu/api/1/datasets/gtfs')).json() 20 | const latestUrl = findLatestFeed(response, 'cc-by') 21 | const stream = await got.stream.get(latestUrl) 22 | return stream.pipe(process.stdout) 23 | } 24 | 25 | export const luxembourgNetex = async () => { 26 | const response = await got.get(new URL('https://data.public.lu/api/1/datasets/horaires-et-arrets-des-transport-publics-netex/')).json() 27 | const latestUrl = findLatestFeed(response, 'cc-zero') 28 | const stream = await got.stream.get(latestUrl) 29 | return stream.pipe(process.stdout) 30 | } 31 | -------------------------------------------------------------------------------- /src/fetch.js: -------------------------------------------------------------------------------- 1 | const dataset = process.env.DATASET 2 | 3 | const main = async () => { 4 | if (['DE_NETEX', 'DE_GTFS', 'DE_ZHV', 'DE_NRW_GTFS', 'DE_HVV_GTFS'].includes(dataset)) { 5 | const [user, password] = [process.env.OPENDATA_OEPNV_EMAIL, process.env.OPENDATA_OEPNV_PASSWORD] 6 | if (typeof user !== 'string' || user.length === 0) throw new Error('env.OPENDATA_OEPNV_EMAIL must be a non-empty string') 7 | if (typeof password !== 'string' || password.length === 0) throw new Error('env.OPENDATA_OEPNV_PASSWORD must be a non-empty string') 8 | 9 | if (dataset === 'DE_NETEX') { 10 | const { netex } = await import('./scrapers/opendata-oepnv.js') 11 | await netex(user, password) 12 | return 13 | } 14 | if (dataset === 'DE_GTFS') { 15 | const { gtfs } = await import('./scrapers/opendata-oepnv.js') 16 | await gtfs(user, password) 17 | return 18 | } 19 | if (dataset === 'DE_ZHV') { 20 | const { zhv } = await import('./scrapers/opendata-oepnv.js') 21 | await zhv(user, password) 22 | return 23 | } 24 | if (dataset === 'DE_NRW_GTFS') { 25 | const { nrwGtfs } = await import('./scrapers/opendata-oepnv.js') 26 | await nrwGtfs(user, password) 27 | return 28 | } 29 | if (dataset === 'DE_HVV_GTFS') { 30 | const { hvvGtfs } = await import('./scrapers/opendata-oepnv.js') 31 | await hvvGtfs(user, password) 32 | return 33 | } 34 | } 35 | if (dataset === 'LU_GTFS') { 36 | const { luxembourgGtfs } = await import('./scrapers/luxembourg.js') 37 | await luxembourgGtfs() 38 | return 39 | } 40 | if (dataset === 'LU_NETEX') { 41 | const { luxembourgNetex } = await import('./scrapers/luxembourg.js') 42 | await luxembourgNetex() 43 | return 44 | } 45 | throw new Error(`unknown dataset: ${dataset}`) 46 | } 47 | 48 | main() 49 | .catch(error => { 50 | console.error(error) 51 | process.exit(1) 52 | }) 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "public-transport-data-scraper", 3 | "version": "0.0.0", 4 | "private": true, 5 | "description": "Scraper that re-publishes official public transport datasets under stable URLs.", 6 | "homepage": "https://github.com/juliuste/public-transport-data-scraper", 7 | "bugs": "https://github.com/juliuste/public-transport-data-scraper/issues", 8 | "repository": "juliuste/public-transport-data-scraper", 9 | "license": "ISC", 10 | "author": "Julius Tens ", 11 | "type": "module", 12 | "scripts": { 13 | "check-deps": "depcheck", 14 |
"fix": "npm run lint -- --fix", 15 | "lint": "eslint src", 16 | "fetch-de-gtfs": "DATASET=DE_GTFS FILE_NAME=de-gtfs.zip MINIMUM_SIZE_MB=100 ./src/run.sh", 17 | "fetch-de-hvv-gtfs": "DATASET=DE_HVV_GTFS FILE_NAME=de-hvv-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh", 18 | "fetch-de-netex": "DATASET=DE_NETEX FILE_NAME=de-netex.zip MINIMUM_SIZE_MB=500 ./src/run.sh", 19 | "fetch-de-nrw-gtfs": "DATASET=DE_NRW_GTFS FILE_NAME=de-nrw-gtfs.zip MINIMUM_SIZE_MB=30 ./src/run.sh", 20 | "fetch-de-zhv": "DATASET=DE_ZHV FILE_NAME=de-zhv.zip MINIMUM_SIZE_MB=10 ./src/run.sh", 21 | "fetch-lu-gtfs": "DATASET=LU_GTFS FILE_NAME=lu-gtfs.zip MINIMUM_SIZE_MB=3 ./src/run.sh", 22 | "fetch-lu-netex": "DATASET=LU_NETEX FILE_NAME=lu-netex.zip MINIMUM_SIZE_MB=10 ./src/run.sh", 23 | "fetch-se-gtfs": "DATASET=SE_GTFS FILE_NAME=se-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh", 24 | "test": "npm run lint && npm run check-deps" 25 | }, 26 | "dependencies": { 27 | "cheerio": "1.1.2", 28 | "got": "^14.6.5", 29 | "load-json-file": "^7.0.1", 30 | "lodash": "^4.17.21", 31 | "luxon": "^3.7.2" 32 | }, 33 | "devDependencies": { 34 | "depcheck": "^1.4.7", 35 | "eslint": "^8.57.1", 36 | "eslint-config-standard": "^17.1.0", 37 | "eslint-plugin-import": "^2.32.0", 38 | "eslint-plugin-n": "^16.6.2", 39 | "eslint-plugin-promise": "^6.6.0" 40 | }, 41 | "engines": { 42 | "node": ">=18" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/mapping.ttl: -------------------------------------------------------------------------------- 1 | @prefix bahnhof: . 2 | @prefix codeforde: . 3 | @prefix zhv: . 4 | @prefix dct: . 5 | @prefix dbo: . 6 | @prefix foaf: . 7 | @prefix geo: . 8 | @prefix gn: . 9 | @prefix juso: . 10 | @prefix netex: . 11 | @prefix owl: . 12 | @prefix ql: . 13 | @prefix rdf: . 14 | @prefix rdfs: . 15 | @prefix rml: . 16 | @prefix rr: . 17 | @prefix schema: . 18 | @prefix status: . 19 | @prefix wdt: . 20 | @prefix xsd: . 21 | 22 | # level 1 (stop place) 23 | 24 | [ a rr:TriplesMap ] 25 | rml:logicalSource [ 26 | rml:source "data/source-with-ars.json"; 27 | rml:referenceFormulation ql:JSONPath; 28 | rml:iterator "$[?(@.Type == 'S')]" 29 | ]; 30 | rr:subjectMap [ 31 | rr:template "resource/by-key/ifopt/{.DHID}"; 32 | rr:class netex:StopPlace 33 | ]; 34 | rr:predicateObjectMap [ 35 | rr:predicate netex:topographicPlace; 36 | rr:objectMap [ 37 | rr:termType rr:IRI; 38 | rr:template "resource/by-key/ars-padded/{.arsPadded}" 39 | ]; 40 | ], [ 41 | rr:predicate bahnhof:ifoptStopId; 42 | rr:objectMap [ 43 | rr:datatype xsd:string; 44 | rr:termType rr:Literal; 45 | rml:reference "$.DHID" 46 | ] 47 | ], [ 48 | rr:predicate netex:name; 49 | rr:objectMap [ 50 | rr:datatype xsd:string; 51 | rr:termType rr:Literal; 52 | rml:reference "$.Name" 53 | ] 54 | ], [ 55 | rr:predicate zhv:lastOperationDate; 56 | rr:objectMap [ 57 | rr:datatype xsd:dateTime; 58 | rr:termType rr:Literal; 59 | rml:reference "$.LastOperationDate" 60 | ] 61 | ]. 
62 | # todo: other attributes 63 | 64 | # todo: level 2 (quay - platform), level 3 (quay - platform edge) 65 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # public-transport-data-scraper 2 | 3 | Scraper that re-publishes official public transport datasets under stable URLs, since many transportation authorities and government agencies sadly don't provide any on their own, making it nearly impossible to integrate these datasets in automated systems. 4 | 5 | ## Scraped datasets 6 | 7 | Dataset | License | Attribution | Stable URL 8 | ------- | ------- | ----------- | ---------- 9 | [Germany-wide GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten-gtfs&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/gtfs.zip) 10 | [Germany-wide NETEX feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/netex.zip) 11 | [German public transport stop registry (ZHV)](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-haltestellendaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/zhv.zip) 12 | [North Rhine-Westphalia (NRW) GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/bundeslaender/nrw/startseite?tx_vrrkit_view[dataset_name]=soll-fahrplandaten-nrw&tx_vrrkit_view[action]=details&tx_vrrkit_view[controller]=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/nrw-gtfs.zip) 13 | [Hamburger Verkehrsverbund (HVV) GTFS feed](https://suche.transparenz.hamburg.de/dataset?q=hvv%20gtfs&sort=score+desc%2Ctitle_sort+asc&esq_not_all_versions=true) (🇩🇪) | [DL-DE BY 2.0](https://www.govdata.de/dl-de/by-2-0) | [Hamburger Verkehrsverbund GmbH](https://www.hvv.de/) | [latest](https://scraped.data.public-transport.earth/de/hvv-gtfs.zip) 14 | [Luxembourg-wide GTFS feed](https://data.public.lu/en/datasets/horaires-et-arrets-des-transport-publics-gtfs/) (🇱🇺) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [Administration des transports publics](https://mmtp.gouvernement.lu/de/annuaire.html?idMin=7854) | [latest](https://scraped.data.public-transport.earth/lu/gtfs.zip) 15 | [Luxembourg-wide NeTEx feed](https://data.public.lu/en/datasets/horaires-et-arrets-des-transport-publics-netex/) (🇱🇺) | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | _[Administration des transports publics](https://mmtp.gouvernement.lu/de/annuaire.html?idMin=7854)_ | [latest](https://scraped.data.public-transport.earth/lu/netex.zip) 16 | [Sweden-wide GTFS 
feed](https://www.trafiklab.se/api/trafiklab-apis/gtfs-sverige-2/) (🇸🇪) | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | _[Trafiklab](https://www.trafiklab.se/)_ | [latest](https://scraped.data.public-transport.earth/se/gtfs.zip) 17 | 18 | ## Contributing 19 | 20 | If you found a bug, want to propose a feed or add a new scraper, feel free to visit [the issues page](https://github.com/juliuste/public-transport-data-scraper/issues), or open a pull request. 21 | -------------------------------------------------------------------------------- /src/scrapers/opendata-oepnv.js: -------------------------------------------------------------------------------- 1 | // the data is fetched and processed in these steps: 2 | // 1. obtain a session cookie and the zip file's url 3 | // 2. download the data using the session cookie 4 | 5 | import { DateTime } from 'luxon' 6 | import * as cheerio from 'cheerio' 7 | import got from 'got' 8 | import lodash from 'lodash' 9 | import { basename } from 'path' 10 | 11 | const throwOnMultipleOrNoMatches = fileUrls => { 12 | if (fileUrls.length !== 1) throw new Error(`unexpected number of file urls: ${fileUrls.length}, probably internal error or invalid credentials.`) 13 | return fileUrls[0] 14 | } 15 | 16 | const extractUrlFromResponse = (html, isMatchingFile, selectBestMatch) => { 17 | const parsed = cheerio.load(html) 18 | const urls = Array.from(parsed('a').filter(function (i, el) { 19 | // this === el 20 | return isMatchingFile(parsed(this).attr('href') || '') 21 | })).map(a => parsed(a).attr('href')) 22 | const url = selectBestMatch(urls) 23 | return url 24 | } 25 | 26 | const fetchCookie = async (user, password) => { 27 | const url = new URL('https://www.opendata-oepnv.de/ht/de/willkommen') 28 | url.searchParams.append('tx_felogin_login[action]', 'login') 29 | url.searchParams.append('tx_felogin_login[controller]', 'Login') 30 | url.searchParams.append('cHash', '99c35a06ebc0db4f37f0bb93048bb79b') 31 | const response = await got.post(url, { 32 | form: { 33 | user, 34 | pass: password, 35 | submit: 'Anmelden', 36 | logintype: 'login', 37 | pid: '174@d6f42d5376399b9d6eee5cbcb5a06dcb1b489387', 38 | }, 39 | }) 40 | const cookie = (response.headers['set-cookie'] || []).find(c => c.includes('fe_typo_user')) 41 | if (!cookie) throw new Error('cookie not found. 
internal error or invalid credentials') 42 | return cookie 43 | } 44 | 45 | const fetchAndOutput = async (user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch = throwOnMultipleOrNoMatches) => { 46 | const cookie = await fetchCookie(user, password) 47 | 48 | const url = new URL(organisationPath, 'https://www.opendata-oepnv.de/ht/de/organisation/') 49 | url.searchParams.append('tx_vrrkit_view[dataset_name]', datasetName) 50 | url.searchParams.append('tx_vrrkit_view[action]', 'details') 51 | url.searchParams.append('tx_vrrkit_view[controller]', 'View') 52 | 53 | const response = await got.get(url, { headers: { Cookie: cookie } }) 54 | const fileUrl = extractUrlFromResponse(response.body, isMatchingFile, selectBestMatch) 55 | 56 | const stream = await got.stream.get(fileUrl, { headers: { Cookie: cookie } }) 57 | return stream.pipe(process.stdout) 58 | } 59 | 60 | export const gtfs = async (user, password) => { 61 | const organisationPath = 'delfi/startseite' 62 | const datasetName = 'deutschlandweite-sollfahrplandaten-gtfs' 63 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland_gtfs.zip') 64 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 65 | } 66 | 67 | export const netex = async (user, password) => { 68 | const organisationPath = 'delfi/startseite' 69 | const datasetName = 'deutschlandweite-sollfahrplandaten' 70 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland.zip') 71 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 72 | } 73 | 74 | export const zhv = async (user, password) => { 75 | const organisationPath = 'delfi/startseite' 76 | const datasetName = 'deutschlandweite-haltestellendaten' 77 | const isMatchingFile = name => name.endsWith('_zHV_gesamt.zip') 78 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 79 | } 80 | 81 | export const nrwGtfs = async (user, password) => { 82 | const organisationPath = 'bundeslaender/nrw/startseite' 83 | const datasetName = 'soll-fahrplandaten-nrw' 84 | const isMatchingFile = name => /\/nrw-gtfs-\d{2}-\d{4}\.zip$/.test(name) 85 | const selectBestMatch = urls => { 86 | const urlsWithDate = urls.map(url => { 87 | const fileName = basename(new URL(url).pathname) 88 | const rawDate = `28-${fileName.slice(-11).slice(0, 7)}` 89 | const date = DateTime.fromFormat(rawDate, 'dd-MM-yyyy').toJSDate() 90 | return { date, url } 91 | }) 92 | const latest = lodash.last(lodash.sortBy(urlsWithDate, ({ date }) => +date)) 93 | // throw if latest file is older than 90 days 94 | if (+new Date() - (+latest.date) > 90 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${latest.date}`) 95 | return latest.url 96 | } 97 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch) 98 | } 99 | 100 | export const hvvGtfs = async (user, password) => { 101 | const organisationPath = 'verkehrsverbuende/hvv/startseite' 102 | const datasetName = 'soll-fahrplandaten-hvv' 103 | const isMatchingFile = name => /\/hvv_rohdaten_gtfs_fpl_\d{8}\.zip$/.test(name) 104 | const selectBestMatch = urls => { 105 | const urlsWithDate = urls.map(url => { 106 | const fileName = basename(new URL(url).pathname) 107 | const rawDate = `${fileName.slice(-12).slice(0, 8)}` 108 | const date = DateTime.fromFormat(rawDate, 'yyyyMMdd').toJSDate() 109 | return { date, url } 110 | }) 111 | const latest = lodash.last(lodash.sortBy(urlsWithDate, ({ date }) => 
+date)) 112 | // throw if latest file is older than 45 days 113 | if (+new Date() - (+latest.date) > 45 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${latest.date}`) 114 | return latest.url 115 | } 116 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch) 117 | } 118 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | pull_request: 5 | schedule: 6 | # keep request limits in mind before increasing the cron frequency 7 | # * is a special character in YAML so you have to quote this string 8 | - cron: '0 2 * * *' 9 | jobs: 10 | test: 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - name: Checkout main 14 | uses: actions/checkout@v4 15 | - uses: pnpm/action-setup@v3 16 | name: Set up pnpm 17 | with: 18 | version: 8 19 | - name: Set up Node 20 | uses: actions/setup-node@v4 21 | with: 22 | node-version: 20 23 | - name: Install dependencies 24 | run: pnpm install 25 | - name: Run tests 26 | run: pnpm test 27 | env: 28 | CI: true 29 | 30 | fetch-and-publish: 31 | runs-on: ubuntu-22.04 32 | environment: main 33 | needs: test 34 | if: github.ref == 'refs/heads/main' 35 | steps: 36 | - name: Checkout main 37 | uses: actions/checkout@v4 38 | - uses: actions/setup-java@v5 39 | with: 40 | distribution: 'temurin' # See 'Supported distributions' for available options 41 | java-version: '21' 42 | - name: Set up Python 3 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: 3 46 | - name: Log awscli version 47 | run: aws --version 48 | - name: Set up awscli configuration 49 | env: 50 | S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} 51 | S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} 52 | S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }} 53 | run: | 54 | set -e; 55 | mkdir ~/.aws; 56 | echo " 57 | [default] 58 | aws_access_key_id=$S3_ACCESS_KEY_ID 59 | aws_secret_access_key=$S3_SECRET_ACCESS_KEY 60 | " > ~/.aws/credentials; 61 | echo " 62 | [default] 63 | endpoint_url = $S3_ENDPOINT 64 | s3 = 65 | multipart_threshold = 4GB 66 | multipart_chunksize = 4GB 67 | request_checksum_calculation = when_required 68 | " > ~/.aws/config; 69 | 70 | - uses: pnpm/action-setup@v3 71 | name: Set up pnpm 72 | with: 73 | version: 8 74 | - name: Set up Node 75 | uses: actions/setup-node@v4 76 | with: 77 | node-version: 20 78 | - name: Install node dependencies 79 | run: pnpm install 80 | 81 | - name: "Fetch and upload feed: DE_GTFS" 82 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 83 | env: 84 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 85 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 86 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 87 | run: | 88 | set -e; 89 | npm run fetch-de-gtfs; 90 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/gtfs.zip || echo 'not-yet-existing'); 91 | newhash=$(cat de-gtfs.zip | md5sum); 92 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 93 | then 94 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-gtfs.zip s3://"$S3_BUCKET_NAME"/de/gtfs.zip 95 | else 96 | echo 'file unchanged, skipping.' 
97 | fi; 98 | 99 | - name: "Fetch and upload feed: DE_ZHV" 100 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 101 | env: 102 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 103 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 104 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 105 | run: | 106 | set -e; 107 | npm run fetch-de-zhv; 108 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/zhv.zip || echo 'not-yet-existing'); 109 | newhash=$(cat de-zhv.zip | md5sum); 110 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 111 | then 112 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-zhv.zip s3://"$S3_BUCKET_NAME"/de/zhv.zip 113 | else 114 | echo 'file unchanged, skipping.' 115 | fi; 116 | 117 | - name: "Generate linked data: DE_ZHV" 118 | # todo: do not run if processing the dataset failed 119 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 120 | env: 121 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 122 | run: | 123 | set -e; 124 | ./src/linked-data/zhv-de/index.sh 125 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/zhv.ttl.gz || echo 'not-yet-existing'); 126 | newhash=$(cat ./src/linked-data/zhv-de/data/output.ttl.gz | md5sum); 127 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 128 | then 129 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read ./src/linked-data/zhv-de/data/output.ttl.gz s3://"$S3_BUCKET_NAME"/de/zhv.ttl.gz 130 | else 131 | echo 'file unchanged, skipping.' 132 | fi; 133 | 134 | - name: "Fetch and upload feed: DE_NETEX" 135 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 136 | env: 137 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 138 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 139 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 140 | run: | 141 | set -e; 142 | npm run fetch-de-netex; 143 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/netex.zip || echo 'not-yet-existing'); 144 | newhash=$(cat de-netex.zip | md5sum); 145 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 146 | then 147 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-netex.zip s3://"$S3_BUCKET_NAME"/de/netex.zip 148 | else 149 | echo 'file unchanged, skipping.' 150 | fi; 151 | 152 | - name: "Fetch and upload feed: DE_NRW_GTFS" 153 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 154 | env: 155 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 156 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 157 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 158 | run: | 159 | set -e; 160 | npm run fetch-de-nrw-gtfs; 161 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/nrw-gtfs.zip || echo 'not-yet-existing'); 162 | newhash=$(cat de-nrw-gtfs.zip | md5sum); 163 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 164 | then 165 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-nrw-gtfs.zip s3://"$S3_BUCKET_NAME"/de/nrw-gtfs.zip 166 | else 167 | echo 'file unchanged, skipping.' 
168 | fi; 169 | 170 | - name: "Fetch and upload feed: DE_HVV_GTFS" 171 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 172 | env: 173 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 174 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 175 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 176 | run: | 177 | set -e; 178 | npm run fetch-de-hvv-gtfs; 179 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/hvv-gtfs.zip || echo 'not-yet-existing'); 180 | newhash=$(cat de-hvv-gtfs.zip | md5sum); 181 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 182 | then 183 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-hvv-gtfs.zip s3://"$S3_BUCKET_NAME"/de/hvv-gtfs.zip 184 | else 185 | echo 'file unchanged, skipping.' 186 | fi; 187 | 188 | - name: "Fetch and upload feed: LU_GTFS" 189 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 190 | env: 191 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 192 | run: | 193 | set -e; 194 | npm run fetch-lu-gtfs; 195 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key lu/gtfs.zip || echo 'not-yet-existing'); 196 | newhash=$(cat lu-gtfs.zip | md5sum); 197 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 198 | then 199 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read lu-gtfs.zip s3://"$S3_BUCKET_NAME"/lu/gtfs.zip 200 | else 201 | echo 'file unchanged, skipping.' 202 | fi; 203 | 204 | - name: "Fetch and upload feed: LU_NETEX" 205 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 206 | env: 207 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 208 | run: | 209 | set -e; 210 | npm run fetch-lu-netex; 211 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key lu/netex.zip || echo 'not-yet-existing'); 212 | newhash=$(cat lu-netex.zip | md5sum); 213 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 214 | then 215 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read lu-netex.zip s3://"$S3_BUCKET_NAME"/lu/netex.zip 216 | else 217 | echo 'file unchanged, skipping.' 218 | fi; 219 | 220 | - name: "Fetch and upload feed: SE_GTFS" 221 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 222 | env: 223 | TRAFIKLAB_API_KEY: ${{ secrets.TRAFIKLAB_API_KEY }} 224 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 225 | run: | 226 | set -e; 227 | npm run fetch-se-gtfs; 228 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key se/gtfs.zip || echo 'not-yet-existing'); 229 | newhash=$(cat se-gtfs.zip | md5sum); 230 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 231 | then 232 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read se-gtfs.zip s3://"$S3_BUCKET_NAME"/se/gtfs.zip 233 | else 234 | echo 'file unchanged, skipping.' 235 | fi; 236 | --------------------------------------------------------------------------------
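Every "Fetch and upload feed" step in ci.yaml above repeats the same pattern: run the matching npm fetch script, compare the md5 hash of the freshly downloaded archive against the ETag of the object already stored in the bucket (via aws s3api head-object), and upload only when the hash differs. The bash sketch below condenses that pattern into a single function under the same assumptions as the workflow (awscli configured with the custom endpoint, S3_BUCKET_NAME set); the sync_feed helper and its interface are illustrative only and do not exist in the repository.

```bash
#!/usr/bin/env bash
# Illustrative sketch of the compare-and-upload pattern used by the ci.yaml steps above.
# Assumes awscli is configured as in the workflow and S3_BUCKET_NAME is set;
# the function name `sync_feed` and its arguments are hypothetical.
set -e

sync_feed () {
	local npm_script="$1" # e.g. fetch-de-gtfs
	local local_file="$2" # e.g. de-gtfs.zip
	local s3_key="$3"     # e.g. de/gtfs.zip

	# download the feed into $local_file
	npm run "$npm_script"

	# ETag of the stored object (equal to its md5 for single-part uploads),
	# or a placeholder if the object does not exist yet
	local currentobj
	currentobj=$(aws s3api head-object --bucket "$S3_BUCKET_NAME" --key "$s3_key" || echo 'not-yet-existing')
	local newhash
	newhash=$(md5sum < "$local_file" | awk '{print $1}')

	if echo "$currentobj" | grep -q "$newhash"; then
		echo 'file unchanged, skipping.'
	else
		aws s3 cp --checksum-algorithm=CRC32 --acl public-read "$local_file" "s3://$S3_BUCKET_NAME/$s3_key"
	fi
}

# usage, mirroring the DE_GTFS step:
# sync_feed fetch-de-gtfs de-gtfs.zip de/gtfs.zip
```

Note that an S3 ETag only equals the object's md5 hash for non-multipart uploads, which the workflow's 4GB multipart_threshold in the awscli config keeps true for these archives.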