├── .npmrc ├── .gitattributes ├── .github ├── funding.yml └── workflows │ ├── legacy-proxy.yml │ └── ci.yaml ├── src ├── linked-data │ ├── .bin │ │ └── .gitignore │ └── zhv-de │ │ ├── convert.js │ │ ├── index.sh │ │ └── mapping.ttl ├── run.sh ├── scrapers │ ├── luxembourg.js │ └── opendata-oepnv.js └── fetch.js ├── legacy-proxy ├── Dockerfile ├── nginx.conf └── readme.md ├── .gitignore ├── .eslintrc.json ├── .editorconfig ├── license ├── package.json └── readme.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | github: [juliuste] 2 | -------------------------------------------------------------------------------- /src/linked-data/.bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /legacy-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:alpine 2 | 3 | WORKDIR /app 4 | 5 | COPY legacy-proxy/nginx.conf /etc/nginx/conf.d/default.conf 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # general 2 | .DS_Store 3 | *.log 4 | 5 | # node-specific 6 | node_modules 7 | package-lock.json 8 | yarn.lock 9 | shrinkwrap.yaml 10 | pnpm-lock.yaml 11 | dist 12 | 13 | data 14 | *.zip 15 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "standard", 3 | "rules": { 4 | "comma-dangle": [ 5 | "error", 6 | "always-multiline" 7 | ], 8 | "indent": ["error", "tab"], 9 | "no-tabs": "off" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = tab 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.{yml,yaml}] 12 | indent_style = space 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /legacy-proxy/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 3000; 3 | rewrite /gtfs-germany.zip https://scraped.data.public-transport.earth/de/gtfs.zip permanent; 4 | rewrite /netex-germany.zip https://scraped.data.public-transport.earth/de/netex.zip permanent; 5 | rewrite /zhv.zip https://scraped.data.public-transport.earth/de/zhv.zip permanent; 6 | } 7 | -------------------------------------------------------------------------------- /legacy-proxy/readme.md: -------------------------------------------------------------------------------- 1 | # Legacy proxy 2 | 3 | The docker image defined in this directory is deployed to our [shared 
infrastructure](https://github.com/public-transport/infrastructure) and exposes a legacy proxy which redirects all requests for old dataset URLs to their latest equivalents. 4 | 5 | This service only exists for backwards compatibility and might be shut down in the future; users should always use the latest endpoints in their projects. 6 | -------------------------------------------------------------------------------- /src/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ -z ${FILE_NAME+x} ]; then echo "missing env.FILE_NAME"; exit 1; fi 5 | if [ -z ${MINIMUM_SIZE_MB+x} ]; then echo "missing env.MINIMUM_SIZE_MB"; exit 1; fi 6 | 7 | DIRECTORY=$(dirname "$0") 8 | FILE_PATH="$DIRECTORY/../$FILE_NAME" 9 | MINIMUM_SIZE_BYTES=$((1024 * 1024 * $MINIMUM_SIZE_MB)) 10 | 11 | if [[ $DATASET == "SE_GTFS" ]]; then 12 | curl --output "$FILE_PATH" -L "https://api.resrobot.se/gtfs/sweden.zip?key=$TRAFIKLAB_API_KEY" 13 | else 14 | node "$DIRECTORY/fetch.js" > "$FILE_PATH" 15 | fi 16 | 17 | if [ $(wc -c "$FILE_PATH" | awk '{print $1}') -lt $MINIMUM_SIZE_BYTES ]; then echo 'Unexpected file size, seems too small.'; exit 1; fi 18 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/convert.js: -------------------------------------------------------------------------------- 1 | import { resolve } from 'path' 2 | import { loadJsonFile } from 'load-json-file' 3 | 4 | const dirname = import.meta.dirname 5 | 6 | const source = await loadJsonFile(resolve(dirname, './data/source.json')) 7 | const ags2ars = await loadJsonFile(resolve(dirname, './data/ags2ars.json')) 8 | 9 | const output = source.map(s => { 10 | const { MunicipalityCode } = s 11 | if (!MunicipalityCode) return s 12 | if (+MunicipalityCode === 0) return s 13 | const matchingArs = ags2ars[MunicipalityCode] 14 | if (!matchingArs) { 15 | console.error(`no ars found for municipality code: ${MunicipalityCode}`) 16 | return s 17 | } 18 | return { 19 | ...s, 20 | arsPadded: matchingArs.padEnd(12, '0'), 21 | } 22 | }) 23 | 24 | process.stdout.write(JSON.stringify(output, null, 2)) 25 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | Data is provided under separate licenses; refer to readme.md for a full list. 4 | 5 | # Code 6 | 7 | Copyright (c) 2023, Julius Tens 8 | 9 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
12 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | DIR=$(dirname "$0") 4 | BASE_IRI="https://lod.codefor.de/" 5 | BIN_DIR="$DIR/../.bin" 6 | DATA_DIR="$DIR/data" 7 | 8 | echo 'Fetching rmlmapper' 9 | wget -q --show-progress --progress=dot:mega -c -N -O "$BIN_DIR/rmlmapper.jar" https://github.com/RMLio/rmlmapper-java/releases/download/v7.0.0/rmlmapper-7.0.0-r374-all.jar 10 | 11 | rm -rf "$DATA_DIR" 12 | mkdir -p "$DATA_DIR" 13 | 14 | echo 'Fetching AGS mapping…' 15 | curl 'https://scraped.data.juliustens.eu/vg250-ew/ags2ars.json.gz' \ 16 | | gunzip > "$DATA_DIR/ags2ars.json" 17 | 18 | echo 'Preparing data…' 19 | cp "$DIR/../../../de-zhv.zip" "$DATA_DIR/source.zip" 20 | unzip "$DATA_DIR/source.zip" -d "$DATA_DIR/unzipped" 21 | find "$DATA_DIR/unzipped/" -name "*.csv" -exec mv '{}' "$DATA_DIR/source.csv" \; 22 | 23 | pnpx csvtojson --delimiter=";" "$DATA_DIR/source.csv" > "$DATA_DIR/source.json" 24 | 25 | echo 'Adding ARS keys…' 26 | node "$DIR/convert.js" > "$DATA_DIR/source-with-ars.json" 27 | 28 | echo 'Applying mapping…' 29 | java -jar "$BIN_DIR/rmlmapper.jar" -m "$DIR/mapping.ttl" -s turtle --strict --base-iri "$BASE_IRI" > "$DATA_DIR/output.ttl" 30 | 31 | echo 'Compressing output…' 32 | cat "$DATA_DIR/output.ttl" | gzip > "$DATA_DIR/output.ttl.gz" 33 | 34 | # echo 'Done.' 35 | -------------------------------------------------------------------------------- /.github/workflows/legacy-proxy.yml: -------------------------------------------------------------------------------- 1 | name: Legacy proxy 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | build-and-push: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up QEMU 12 | uses: docker/setup-qemu-action@v3 13 | - name: Set up Docker Buildx 14 | uses: docker/setup-buildx-action@v3 15 | - name: Login to GitHub Container Registry 16 | uses: docker/login-action@v3 17 | with: 18 | registry: ghcr.io 19 | username: ${{ github.repository_owner }} 20 | password: ${{ secrets.GITHUB_TOKEN }} 21 | - name: Fetch commit hash 22 | id: hash 23 | run: echo "hash=$(echo $GITHUB_SHA | head -c7)" >> "$GITHUB_OUTPUT" 24 | - name: Fetch current date and time 25 | id: datetime 26 | run: echo "datetime=$(date -u +'%Y-%m-%dT%H.%M.%SZ')" >> "$GITHUB_OUTPUT" 27 | - name: Build and push 28 | uses: docker/build-push-action@v3 29 | with: 30 | file: ./legacy-proxy/Dockerfile 31 | platforms: linux/amd64,linux/arm64 32 | push: true 33 | tags: ghcr.io/${{github.repository}}-legacy-proxy:v1_${{steps.hash.outputs.hash}}_${{steps.datetime.outputs.datetime}} 34 | cache-from: type=gha 35 | cache-to: type=gha,mode=max 36 | -------------------------------------------------------------------------------- /src/scrapers/luxembourg.js: -------------------------------------------------------------------------------- 1 | import got from 'got' 2 | import lodash from 'lodash' 3 | 4 | const findLatestFeed = (response, license) => { 5 | if (response?.license !== license) throw new Error('unexpected license') 6 | const [latest] = lodash.sortBy((response?.resources || []).filter(r => r?.format === 'zip'), r => -new Date(r?.created_at)) 7 | if (!latest) throw new Error('no matching dataset found') 8 | 9 | const { created_at: createdAt, url } = latest 10 | if (!url || !createdAt) throw new Error('missing resource properties') 11 | 12 | // throw if latest file is
older than 20 days 13 | if (+new Date() - (+new Date(createdAt)) > 20 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${createdAt}`) 14 | 15 | return url 16 | } 17 | 18 | export const luxembourgGtfs = async () => { 19 | const response = await got.get(new URL('https://data.public.lu/api/1/datasets/gtfs')).json() 20 | const latestUrl = findLatestFeed(response, 'cc-by') 21 | const stream = await got.stream.get(latestUrl) 22 | return stream.pipe(process.stdout) 23 | } 24 | 25 | export const luxembourgNetex = async () => { 26 | const response = await got.get(new URL('https://data.public.lu/api/1/datasets/horaires-et-arrets-des-transport-publics-netex/')).json() 27 | const latestUrl = findLatestFeed(response, 'cc-zero') 28 | const stream = await got.stream.get(latestUrl) 29 | return stream.pipe(process.stdout) 30 | } 31 | -------------------------------------------------------------------------------- /src/fetch.js: -------------------------------------------------------------------------------- 1 | const dataset = process.env.DATASET 2 | 3 | const main = async () => { 4 | if (['DE_NETEX', 'DE_GTFS', 'DE_ZHV', 'DE_NRW_GTFS', 'DE_HVV_GTFS'].includes(dataset)) { 5 | const [user, password] = [process.env.OPENDATA_OEPNV_EMAIL, process.env.OPENDATA_OEPNV_PASSWORD] 6 | if (typeof user !== 'string' || user.length === 0) throw new Error('env.OPENDATA_OEPNV_EMAIL must be a non-empty string') 7 | if (typeof password !== 'string' || password.length === 0) throw new Error('env.OPENDATA_OEPNV_PASSWORD must be a non-empty string') 8 | 9 | if (dataset === 'DE_NETEX') { 10 | const { netex } = await import('./scrapers/opendata-oepnv.js') 11 | await netex(user, password) 12 | return 13 | } 14 | if (dataset === 'DE_GTFS') { 15 | const { gtfs } = await import('./scrapers/opendata-oepnv.js') 16 | await gtfs(user, password) 17 | return 18 | } 19 | if (dataset === 'DE_ZHV') { 20 | const { zhv } = await import('./scrapers/opendata-oepnv.js') 21 | await zhv(user, password) 22 | return 23 | } 24 | if (dataset === 'DE_NRW_GTFS') { 25 | const { nrwGtfs } = await import('./scrapers/opendata-oepnv.js') 26 | await nrwGtfs(user, password) 27 | return 28 | } 29 | if (dataset === 'DE_HVV_GTFS') { 30 | const { hvvGtfs } = await import('./scrapers/opendata-oepnv.js') 31 | await hvvGtfs(user, password) 32 | return 33 | } 34 | } 35 | if (dataset === 'LU_GTFS') { 36 | const { luxembourgGtfs } = await import('./scrapers/luxembourg.js') 37 | await luxembourgGtfs() 38 | return 39 | } 40 | if (dataset === 'LU_NETEX') { 41 | const { luxembourgNetex } = await import('./scrapers/luxembourg.js') 42 | await luxembourgNetex() 43 | return 44 | } 45 | throw new Error(`unknown dataset: ${dataset}`) 46 | } 47 | 48 | main() 49 | .catch(error => { 50 | console.error(error) 51 | process.exit(1) 52 | }) 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "public-transport-data-scraper", 3 | "version": "0.0.0", 4 | "private": true, 5 | "description": "Scraper that re-publishes official public transport datasets under stable URLs.", 6 | "homepage": "https://github.com/juliuste/public-transport-data-scraper", 7 | "bugs": "https://github.com/juliuste/public-transport-data-scraper/issues", 8 | "repository": "juliuste/public-transport-data-scraper", 9 | "license": "ISC", 10 | "author": "Julius Tens ", 11 | "type": "module", 12 | "scripts": { 13 | "check-deps": "depcheck", 14 |
"fix": "npm run lint -- --fix", 15 | "lint": "eslint src", 16 | "fetch-de-gtfs": "DATASET=DE_GTFS FILE_NAME=de-gtfs.zip MINIMUM_SIZE_MB=100 ./src/run.sh", 17 | "fetch-de-hvv-gtfs": "DATASET=DE_HVV_GTFS FILE_NAME=de-hvv-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh", 18 | "fetch-de-netex": "DATASET=DE_NETEX FILE_NAME=de-netex.zip MINIMUM_SIZE_MB=500 ./src/run.sh", 19 | "fetch-de-nrw-gtfs": "DATASET=DE_NRW_GTFS FILE_NAME=de-nrw-gtfs.zip MINIMUM_SIZE_MB=30 ./src/run.sh", 20 | "fetch-de-zhv": "DATASET=DE_ZHV FILE_NAME=de-zhv.zip MINIMUM_SIZE_MB=10 ./src/run.sh", 21 | "fetch-lu-gtfs": "DATASET=LU_GTFS FILE_NAME=lu-gtfs.zip MINIMUM_SIZE_MB=3 ./src/run.sh", 22 | "fetch-lu-netex": "DATASET=LU_NETEX FILE_NAME=lu-netex.zip MINIMUM_SIZE_MB=10 ./src/run.sh", 23 | "fetch-se-gtfs": "DATASET=SE_GTFS FILE_NAME=se-gtfs.zip MINIMUM_SIZE_MB=25 ./src/run.sh", 24 | "test": "npm run lint && npm run check-deps" 25 | }, 26 | "dependencies": { 27 | "cheerio": "1.1.2", 28 | "got": "^14.6.5", 29 | "load-json-file": "^7.0.1", 30 | "lodash": "^4.17.21", 31 | "luxon": "^3.7.2" 32 | }, 33 | "devDependencies": { 34 | "depcheck": "^1.4.7", 35 | "eslint": "^8.57.1", 36 | "eslint-config-standard": "^17.1.0", 37 | "eslint-plugin-import": "^2.32.0", 38 | "eslint-plugin-n": "^16.6.2", 39 | "eslint-plugin-promise": "^6.6.0" 40 | }, 41 | "engines": { 42 | "node": ">=18" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/linked-data/zhv-de/mapping.ttl: -------------------------------------------------------------------------------- 1 | @prefix bahnhof: . 2 | @prefix codeforde: . 3 | @prefix zhv: . 4 | @prefix dct: . 5 | @prefix dbo: . 6 | @prefix foaf: . 7 | @prefix geo: . 8 | @prefix gn: . 9 | @prefix juso: . 10 | @prefix netex: . 11 | @prefix owl: . 12 | @prefix ql: . 13 | @prefix rdf: . 14 | @prefix rdfs: . 15 | @prefix rml: . 16 | @prefix rr: . 17 | @prefix schema: . 18 | @prefix status: . 19 | @prefix wdt: . 20 | @prefix xsd: . 21 | 22 | # level 1 (stop place) 23 | 24 | [ a rr:TriplesMap ] 25 | rml:logicalSource [ 26 | rml:source "data/source-with-ars.json"; 27 | rml:referenceFormulation ql:JSONPath; 28 | rml:iterator "$[?(@.Type == 'S')]" 29 | ]; 30 | rr:subjectMap [ 31 | rr:template "resource/by-key/ifopt/{.DHID}"; 32 | rr:class netex:StopPlace 33 | ]; 34 | rr:predicateObjectMap [ 35 | rr:predicate netex:topographicPlace; 36 | rr:objectMap [ 37 | rr:termType rr:IRI; 38 | rr:template "resource/by-key/ars-padded/{.arsPadded}" 39 | ]; 40 | ], [ 41 | rr:predicate bahnhof:ifoptStopId; 42 | rr:objectMap [ 43 | rr:datatype xsd:string; 44 | rr:termType rr:Literal; 45 | rml:reference "$.DHID" 46 | ] 47 | ], [ 48 | rr:predicate netex:name; 49 | rr:objectMap [ 50 | rr:datatype xsd:string; 51 | rr:termType rr:Literal; 52 | rml:reference "$.Name" 53 | ] 54 | ], [ 55 | rr:predicate zhv:lastOperationDate; 56 | rr:objectMap [ 57 | rr:datatype xsd:dateTime; 58 | rr:termType rr:Literal; 59 | rml:reference "$.LastOperationDate" 60 | ] 61 | ]. 
62 | # todo: other attributes 63 | 64 | # todo: level 2 (quay - platform), level 3 (quay - platform edge) 65 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # public-transport-data-scraper 2 | 3 | Scraper that re-publishes official public transport datasets under stable URLs, since many transportation authorities and government agencies sadly don't provide any on their own, making it nearly impossible to integrate these datasets in automated systems. 4 | 5 | ## Scraped datasets 6 | 7 | Dataset | License | Attribution | Stable URL 8 | ------- | ------- | ----------- | ---------- 9 | [Germany-wide GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten-gtfs&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/gtfs.zip) 10 | [Germany-wide NETEX feed](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-sollfahrplandaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/netex.zip) 11 | [German public transport stop registry (ZHV)](https://www.opendata-oepnv.de/ht/de/organisation/delfi/startseite?tx_vrrkit_view%5Bdataset_name%5D=deutschlandweite-haltestellendaten&tx_vrrkit_view%5Baction%5D=details&tx_vrrkit_view%5Bcontroller%5D=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/zhv.zip) 12 | [North Rhine-Westphalia (NRW) GTFS feed](https://www.opendata-oepnv.de/ht/de/organisation/bundeslaender/nrw/startseite?tx_vrrkit_view[dataset_name]=soll-fahrplandaten-nrw&tx_vrrkit_view[action]=details&tx_vrrkit_view[controller]=View) (🇩🇪) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [opendata-oepnv.de](https://www.opendata-oepnv.de) | [latest](https://scraped.data.public-transport.earth/de/nrw-gtfs.zip) 13 | [Hamburger Verkehrsverbund (HVV) GTFS feed](https://suche.transparenz.hamburg.de/dataset?q=hvv%20gtfs&sort=score+desc%2Ctitle_sort+asc&esq_not_all_versions=true) (🇩🇪) | [DL-DE BY 2.0](https://www.govdata.de/dl-de/by-2-0) | [Hamburger Verkehrsverbund GmbH](https://www.hvv.de/) | [latest](https://scraped.data.public-transport.earth/de/hvv-gtfs.zip) 14 | [Luxembourg-wide GTFS feed](https://data.public.lu/en/datasets/horaires-et-arrets-des-transport-publics-gtfs/) (🇱🇺) | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | [Administration des transports publics](https://mmtp.gouvernement.lu/de/annuaire.html?idMin=7854) | [latest](https://scraped.data.public-transport.earth/lu/gtfs.zip) 15 | [Luxembourg-wide NeTEx feed](https://data.public.lu/en/datasets/horaires-et-arrets-des-transport-publics-netex/) (🇱🇺) | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | _[Administration des transports publics](https://mmtp.gouvernement.lu/de/annuaire.html?idMin=7854)_ | [latest](https://scraped.data.public-transport.earth/lu/netex.zip) 16 | [Sweden-wide GTFS 
feed](https://www.trafiklab.se/api/trafiklab-apis/gtfs-sverige-2/) (🇸🇪) | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | _[Trafiklab](https://www.trafiklab.se/)_ | [latest](https://scraped.data.public-transport.earth/se/gtfs.zip) 17 | 18 | ## Contributing 19 | 20 | If you found a bug, want to propose a feed or add a new scraper, feel free to visit [the issues page](https://github.com/juliuste/public-transport-data-scraper/issues), or open a pull request. 21 | -------------------------------------------------------------------------------- /src/scrapers/opendata-oepnv.js: -------------------------------------------------------------------------------- 1 | // the data is fetched and processed in these steps: 2 | // 1. obtain a session cookie and the zip file's url 3 | // 2. download the data using the session cookie 4 | 5 | import { DateTime } from 'luxon' 6 | import * as cheerio from 'cheerio' 7 | import got from 'got' 8 | import lodash from 'lodash' 9 | import { basename } from 'path' 10 | 11 | const throwOnMultipleOrNoMatches = fileUrls => { 12 | if (fileUrls.length !== 1) throw new Error(`unexpected number of file urls: ${fileUrls.length}, probably internal error or invalid credentials.`) 13 | return fileUrls[0] 14 | } 15 | 16 | const extractUrlFromResponse = (html, isMatchingFile, selectBestMatch) => { 17 | const parsed = cheerio.load(html) 18 | const urls = Array.from(parsed('a').filter(function (i, el) { 19 | // this === el 20 | return isMatchingFile(parsed(this).attr('href') || '') 21 | })).map(a => parsed(a).attr('href')) 22 | const url = selectBestMatch(urls) 23 | return url 24 | } 25 | 26 | const fetchCookie = async (user, password) => { 27 | const url = new URL('https://www.opendata-oepnv.de/ht/de/willkommen') 28 | url.searchParams.append('tx_felogin_login[action]', 'login') 29 | url.searchParams.append('tx_felogin_login[controller]', 'Login') 30 | url.searchParams.append('cHash', '99c35a06ebc0db4f37f0bb93048bb79b') 31 | const response = await got.post(url, { 32 | form: { 33 | user, 34 | pass: password, 35 | submit: 'Anmelden', 36 | logintype: 'login', 37 | pid: '174@d6f42d5376399b9d6eee5cbcb5a06dcb1b489387', 38 | }, 39 | }) 40 | const cookie = (response.headers['set-cookie'] || []).find(c => c.includes('fe_typo_user')) 41 | if (!cookie) throw new Error('cookie not found. 
internal error or invalid credentials') 42 | return cookie 43 | } 44 | 45 | const fetchAndOutput = async (user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch = throwOnMultipleOrNoMatches) => { 46 | const cookie = await fetchCookie(user, password) 47 | 48 | const url = new URL(organisationPath, 'https://www.opendata-oepnv.de/ht/de/organisation/') 49 | url.searchParams.append('tx_vrrkit_view[dataset_name]', datasetName) 50 | url.searchParams.append('tx_vrrkit_view[action]', 'details') 51 | url.searchParams.append('tx_vrrkit_view[controller]', 'View') 52 | 53 | const response = await got.get(url, { headers: { Cookie: cookie } }) 54 | const fileUrl = extractUrlFromResponse(response.body, isMatchingFile, selectBestMatch) 55 | 56 | const stream = await got.stream.get(fileUrl, { headers: { Cookie: cookie } }) 57 | return stream.pipe(process.stdout) 58 | } 59 | 60 | export const gtfs = async (user, password) => { 61 | const organisationPath = 'delfi/startseite' 62 | const datasetName = 'deutschlandweite-sollfahrplandaten-gtfs' 63 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland_gtfs.zip') 64 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 65 | } 66 | 67 | export const netex = async (user, password) => { 68 | const organisationPath = 'delfi/startseite' 69 | const datasetName = 'deutschlandweite-sollfahrplandaten' 70 | const isMatchingFile = name => name.endsWith('_fahrplaene_gesamtdeutschland.zip') 71 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 72 | } 73 | 74 | export const zhv = async (user, password) => { 75 | const organisationPath = 'delfi/startseite' 76 | const datasetName = 'deutschlandweite-haltestellendaten' 77 | const isMatchingFile = name => name.endsWith('_zHV_gesamt.zip') 78 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile) 79 | } 80 | 81 | export const nrwGtfs = async (user, password) => { 82 | const organisationPath = 'bundeslaender/nrw/startseite' 83 | const datasetName = 'soll-fahrplandaten-nrw' 84 | const isMatchingFile = name => /\/nrw-gtfs-\d{2}-\d{4}\.zip$/.test(name) 85 | const selectBestMatch = urls => { 86 | const urlsWithDate = urls.map(url => { 87 | const fileName = basename(new URL(url).pathname) 88 | const rawDate = `28-${fileName.slice(-11).slice(0, 7)}` 89 | const date = DateTime.fromFormat(rawDate, 'dd-MM-yyyy').toJSDate() 90 | return { date, url } 91 | }) 92 | const latest = lodash.last(lodash.sortBy(urlsWithDate, ({ date }) => +date)) 93 | // throw if latest file is older than 90 days 94 | if (+new Date() - (+latest.date) > 90 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${latest.date}`) 95 | return latest.url 96 | } 97 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch) 98 | } 99 | 100 | export const hvvGtfs = async (user, password) => { 101 | const organisationPath = 'verkehrsverbuende/hvv/startseite' 102 | const datasetName = 'soll-fahrplandaten-hvv' 103 | const isMatchingFile = name => /\/hvv_rohdaten_gtfs_fpl_\d{8}\.zip$/.test(name) 104 | const selectBestMatch = urls => { 105 | const urlsWithDate = urls.map(url => { 106 | const fileName = basename(new URL(url).pathname) 107 | const rawDate = `${fileName.slice(-12).slice(0, 8)}` 108 | const date = DateTime.fromFormat(rawDate, 'yyyyMMdd').toJSDate() 109 | return { date, url } 110 | }) 111 | const latest = lodash.last(lodash.sortBy(urlsWithDate, ({ date }) => 
+date)) 112 | // throw if latest file is older than 45 days 113 | if (+new Date() - (+latest.date) > 45 * 24 * 60 * 60 * 1000) throw new Error(`latest dataset seems to be outdated: ${latest.date}`) 114 | return latest.url 115 | } 116 | await fetchAndOutput(user, password, organisationPath, datasetName, isMatchingFile, selectBestMatch) 117 | } 118 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | pull_request: 5 | schedule: 6 | # keep request limits in mind before increasing the cron frequency 7 | # * is a special character in YAML so you have to quote this string 8 | - cron: '0 2 * * *' 9 | jobs: 10 | test: 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - name: Checkout main 14 | uses: actions/checkout@v4 15 | - uses: pnpm/action-setup@v3 16 | name: Set up pnpm 17 | with: 18 | version: 8 19 | - name: Set up Node 20 | uses: actions/setup-node@v4 21 | with: 22 | node-version: 20 23 | - name: Install dependencies 24 | run: pnpm install 25 | - name: Run tests 26 | run: pnpm test 27 | env: 28 | CI: true 29 | 30 | fetch-and-publish: 31 | runs-on: ubuntu-22.04 32 | environment: main 33 | needs: test 34 | if: github.ref == 'refs/heads/main' 35 | steps: 36 | - name: Checkout main 37 | uses: actions/checkout@v4 38 | - uses: actions/setup-java@v5 39 | with: 40 | distribution: 'temurin' # See 'Supported distributions' for available options 41 | java-version: '21' 42 | - name: Set up Python 3 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: 3 46 | - name: Log awscli version 47 | run: aws --version 48 | - name: Set up awscli configuration 49 | env: 50 | S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} 51 | S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} 52 | S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }} 53 | run: | 54 | set -e; 55 | mkdir ~/.aws; 56 | echo " 57 | [default] 58 | aws_access_key_id=$S3_ACCESS_KEY_ID 59 | aws_secret_access_key=$S3_SECRET_ACCESS_KEY 60 | " > ~/.aws/credentials; 61 | echo " 62 | [default] 63 | endpoint_url = $S3_ENDPOINT 64 | s3 = 65 | multipart_threshold = 4GB 66 | multipart_chunksize = 4GB 67 | request_checksum_calculation = when_required 68 | " > ~/.aws/config; 69 | 70 | - uses: pnpm/action-setup@v3 71 | name: Set up pnpm 72 | with: 73 | version: 8 74 | - name: Set up Node 75 | uses: actions/setup-node@v4 76 | with: 77 | node-version: 20 78 | - name: Install node dependencies 79 | run: pnpm install 80 | 81 | - name: "Fetch and upload feed: DE_GTFS" 82 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 83 | env: 84 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 85 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 86 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 87 | run: | 88 | set -e; 89 | npm run fetch-de-gtfs; 90 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/gtfs.zip || echo 'not-yet-existing'); 91 | newhash=$(cat de-gtfs.zip | md5sum); 92 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 93 | then 94 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-gtfs.zip s3://"$S3_BUCKET_NAME"/de/gtfs.zip 95 | else 96 | echo 'file unchanged, skipping.' 
97 | fi; 98 | 99 | - name: "Fetch and upload feed: DE_ZHV" 100 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 101 | env: 102 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 103 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 104 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 105 | run: | 106 | set -e; 107 | npm run fetch-de-zhv; 108 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/zhv.zip || echo 'not-yet-existing'); 109 | newhash=$(cat de-zhv.zip | md5sum); 110 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 111 | then 112 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-zhv.zip s3://"$S3_BUCKET_NAME"/de/zhv.zip 113 | else 114 | echo 'file unchanged, skipping.' 115 | fi; 116 | 117 | - name: "Generate linked data: DE_ZHV" 118 | # todo: do not run if processing the dataset failed 119 | if: ${{ success() }} # this should allow the other steps to run, but should still mark the workflow as failing 120 | env: 121 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 122 | run: | 123 | set -e; 124 | ./src/linked-data/zhv-de/index.sh 125 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/zhv.ttl.gz || echo 'not-yet-existing'); 126 | newhash=$(cat ./src/linked-data/zhv-de/data/output.ttl.gz | md5sum); 127 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 128 | then 129 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read ./src/linked-data/zhv-de/data/output.ttl.gz s3://"$S3_BUCKET_NAME"/de/zhv.ttl.gz 130 | else 131 | echo 'file unchanged, skipping.' 132 | fi; 133 | 134 | - name: "Fetch and upload feed: DE_NETEX" 135 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 136 | env: 137 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 138 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 139 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 140 | run: | 141 | set -e; 142 | npm run fetch-de-netex; 143 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/netex.zip || echo 'not-yet-existing'); 144 | newhash=$(cat de-netex.zip | md5sum); 145 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 146 | then 147 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-netex.zip s3://"$S3_BUCKET_NAME"/de/netex.zip 148 | else 149 | echo 'file unchanged, skipping.' 150 | fi; 151 | 152 | - name: "Fetch and upload feed: DE_NRW_GTFS" 153 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 154 | env: 155 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 156 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 157 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 158 | run: | 159 | set -e; 160 | npm run fetch-de-nrw-gtfs; 161 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/nrw-gtfs.zip || echo 'not-yet-existing'); 162 | newhash=$(cat de-nrw-gtfs.zip | md5sum); 163 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 164 | then 165 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-nrw-gtfs.zip s3://"$S3_BUCKET_NAME"/de/nrw-gtfs.zip 166 | else 167 | echo 'file unchanged, skipping.' 
168 | fi; 169 | 170 | - name: "Fetch and upload feed: DE_HVV_GTFS" 171 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 172 | env: 173 | OPENDATA_OEPNV_EMAIL: ${{ secrets.OPENDATA_OEPNV_EMAIL }} 174 | OPENDATA_OEPNV_PASSWORD: ${{ secrets.OPENDATA_OEPNV_PASSWORD }} 175 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 176 | run: | 177 | set -e; 178 | npm run fetch-de-hvv-gtfs; 179 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key de/hvv-gtfs.zip || echo 'not-yet-existing'); 180 | newhash=$(cat de-hvv-gtfs.zip | md5sum); 181 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 182 | then 183 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read de-hvv-gtfs.zip s3://"$S3_BUCKET_NAME"/de/hvv-gtfs.zip 184 | else 185 | echo 'file unchanged, skipping.' 186 | fi; 187 | 188 | - name: "Fetch and upload feed: LU_GTFS" 189 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 190 | env: 191 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 192 | run: | 193 | set -e; 194 | npm run fetch-lu-gtfs; 195 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key lu/gtfs.zip || echo 'not-yet-existing'); 196 | newhash=$(cat lu-gtfs.zip | md5sum); 197 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 198 | then 199 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read lu-gtfs.zip s3://"$S3_BUCKET_NAME"/lu/gtfs.zip 200 | else 201 | echo 'file unchanged, skipping.' 202 | fi; 203 | 204 | - name: "Fetch and upload feed: LU_NETEX" 205 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 206 | env: 207 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 208 | run: | 209 | set -e; 210 | npm run fetch-lu-netex; 211 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key lu/netex.zip || echo 'not-yet-existing'); 212 | newhash=$(cat lu-netex.zip | md5sum); 213 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 214 | then 215 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read lu-netex.zip s3://"$S3_BUCKET_NAME"/lu/netex.zip 216 | else 217 | echo 'file unchanged, skipping.' 218 | fi; 219 | 220 | - name: "Fetch and upload feed: SE_GTFS" 221 | if: ${{ success() || failure() }} # this should allow the other steps to run, but should still mark the workflow as failing 222 | env: 223 | TRAFIKLAB_API_KEY: ${{ secrets.TRAFIKLAB_API_KEY }} 224 | S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} 225 | run: | 226 | set -e; 227 | npm run fetch-se-gtfs; 228 | currentobj=$(aws s3api head-object --bucket $S3_BUCKET_NAME --key se/gtfs.zip || echo 'not-yet-existing'); 229 | newhash=$(cat se-gtfs.zip | md5sum); 230 | if [ $(echo $currentobj | grep $newhash | wc -l) -ne 1 ] 231 | then 232 | aws s3 cp --checksum-algorithm=CRC32 --acl public-read se-gtfs.zip s3://"$S3_BUCKET_NAME"/se/gtfs.zip 233 | else 234 | echo 'file unchanged, skipping.' 235 | fi; 236 | --------------------------------------------------------------------------------
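Every "Fetch and upload feed" step in ci.yaml above repeats the same pattern: run the matching npm fetch script, compare the md5 hash of the freshly downloaded archive against the ETag of the object already stored in the bucket (via aws s3api head-object), and upload only when the hash differs. The bash sketch below condenses that pattern into a single function under the same assumptions as the workflow (awscli configured with the custom endpoint, S3_BUCKET_NAME set); the sync_feed helper and its interface are illustrative only and do not exist in the repository.

```bash
#!/usr/bin/env bash
# Illustrative sketch of the compare-and-upload pattern used by the ci.yaml steps above.
# Assumes awscli is configured as in the workflow and S3_BUCKET_NAME is set;
# the function name `sync_feed` and its arguments are hypothetical.
set -e

sync_feed () {
	local npm_script="$1" # e.g. fetch-de-gtfs
	local local_file="$2" # e.g. de-gtfs.zip
	local s3_key="$3"     # e.g. de/gtfs.zip

	# download the feed into $local_file
	npm run "$npm_script"

	# ETag of the stored object (equal to its md5 for single-part uploads),
	# or a placeholder if the object does not exist yet
	local currentobj
	currentobj=$(aws s3api head-object --bucket "$S3_BUCKET_NAME" --key "$s3_key" || echo 'not-yet-existing')
	local newhash
	newhash=$(md5sum < "$local_file" | awk '{print $1}')

	if echo "$currentobj" | grep -q "$newhash"; then
		echo 'file unchanged, skipping.'
	else
		aws s3 cp --checksum-algorithm=CRC32 --acl public-read "$local_file" "s3://$S3_BUCKET_NAME/$s3_key"
	fi
}

# usage, mirroring the DE_GTFS step:
# sync_feed fetch-de-gtfs de-gtfs.zip de/gtfs.zip
```

Note that an S3 ETag only equals the object's md5 hash for non-multipart uploads, which the workflow's 4GB multipart_threshold in the awscli config keeps true for these archives.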