├── .github └── workflows │ └── run-task.yml ├── .gitignore ├── LICENSE ├── README.md ├── db.json ├── package-lock.json ├── package.json └── src ├── main.js └── utils ├── api.js └── utils.js /.github/workflows/run-task.yml: -------------------------------------------------------------------------------- 1 | name: Run Task 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: "*/30 * * * *" 6 | 7 | jobs: 8 | build-and-deploy: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2.4.0 14 | with: 15 | ref: ${{ github.head_ref }} 16 | 17 | - name: Running 18 | run: | 19 | npm cache clean --force 20 | npm install 21 | npm run task 22 | env: 23 | CHAT_ID: ${{ secrets.CHAT_ID }} 24 | BOT_API: ${{ secrets.BOT_API }} 25 | 26 | - uses: stefanzweifel/git-auto-commit-action@v4 27 | with: 28 | commit_message: Update db.json 29 | branch: main 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 
| # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | .idea 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 blopa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of 
the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # funda-scraper 2 | 3 | A simple Node.js script that goes to funda.nl, gets the listings from the last day(s), and pushes them to a Telegram chat via the Telegram Bot API. 4 | 5 | Read more about it here: https://javascript.plainenglish.io/using-node-js-and-github-action-to-find-a-house-on-the-web-ae03ed64670a 6 | 7 | ## How to use 8 | 1 - Add a search URL with your filters to the array `urls` in the file `main.js`, around line 18. 9 | 2 - Create `CHAT_ID` and `BOT_API` environment variables with your Telegram Chat ID and Telegram Bot API key. [Check here how to create a Telegram Bot API key](https://core.telegram.org/bots/faq#how-do-i-create-a-bot) and [here how to find chat ID in Telegram](https://www.google.com/search?q=how+to+find+chat+id+in+telegram). 10 | 11 | ## Using GitHub Actions 12 | You can simply clone this project and enable GitHub Actions on your clone, and the script will start running every 30 minutes. Don't forget to add `CHAT_ID` and `BOT_API` as repository secrets in GitHub Actions; [check here how to do that](https://docs.github.com/en/actions/security-guides/encrypted-secrets). 
13 | 14 | ## License 15 | MIT License 16 | 17 | Copyright (c) 2022 blopa 18 | 19 | Permission is hereby granted, free of charge, to any person obtaining a copy 20 | of this software and associated documentation files (the "Software"), to deal 21 | in the Software without restriction, including without limitation the rights 22 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | copies of the Software, and to permit persons to whom the Software is 24 | furnished to do so, subject to the following conditions: 25 | 26 | The above copyright notice and this permission notice shall be included in all 27 | copies or substantial portions of the Software. 28 | 29 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | SOFTWARE. 
36 | 37 | -------------------------------------------------------------------------------- /db.json: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "funda-scraper", 3 | "version": "0.1.0", 4 | "description": "", 5 | "main": "src/main.js", 6 | "scripts": { 7 | "task": "node -r esm src/main.js", 8 | "bump": "ncu -u -t minor" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/blopa/funda-scraper.git" 13 | }, 14 | "author": "", 15 | "license": "MIT", 16 | "bugs": { 17 | "url": "https://github.com/blopa/funda-scraper/issues" 18 | }, 19 | "homepage": "https://github.com/blopa/funda-scraper#readme", 20 | "dependencies": { 21 | "dotenv": "^16.0.0", 22 | "esm": "^3.2.25", 23 | "jsdom": "^19.0.0", 24 | "node-fetch": "2.6.7", 25 | "npm-check-updates": "^12.5.3", 26 | "puppeteer": "^13.5.1" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | const { writeFileSync, readFileSync } = require('fs'); 3 | const puppeteer = require('puppeteer'); 4 | const jsdom = require('jsdom'); 5 | const nodeFetch = require('node-fetch'); 6 | const { getZipCode, getNeighbourhoodData, convertResidentsToPercentage} = require('./utils/utils'); 7 | 8 | const WIDTH = 1920; 9 | const HEIGHT = 1080; 10 | 11 | const data = readFileSync('db.json', { encoding:'utf8', flag: 'r' }); 12 | const pastResults = new Set(JSON.parse(data) || []); 13 | console.log('pastResults:', pastResults); 14 | const newResults = new Set(); 15 | const houses = []; 16 | const { CHAT_ID, BOT_API } = process.env; 17 | 18 | const urls = [ 19 | 
'https://www.funda.nl/en/koop/amsterdam/beschikbaar/0-300000/40+woonopp/2+slaapkamers/1-dag/', 20 | 'https://www.funda.nl/en/koop/haarlem/beschikbaar/0-300000/40+woonopp/2+slaapkamers/1-dag/', 21 | ]; 22 | 23 | const runTask = async () => { 24 | for (const url of urls) { 25 | await runPuppeteer(url); 26 | } 27 | 28 | console.log('newResults:', newResults); 29 | 30 | if (newResults.size > 0) { 31 | writeFileSync('db.json', JSON.stringify(Array.from([ 32 | ...newResults, 33 | ...pastResults, 34 | ]))); 35 | 36 | console.log('sending messages to Telegram'); 37 | const date = (new Date()).toISOString().split('T')[0]; 38 | houses.forEach(({ 39 | path, 40 | income, 41 | residentsAge0to14, 42 | residentsAge15to24, 43 | residentsAge25to44, 44 | residentsAge45to64, 45 | residentsAge65AndOlder, 46 | householdsWithChildren, 47 | shareOfMorocco, 48 | shareOfAntillesOrAruba, 49 | shareOfSuriname, 50 | shareOfTurkey, 51 | neighbourhoodName, 52 | municipalityName, 53 | shareOfNonImmigrants, 54 | residentsCount, 55 | totalImmigrantsCount, 56 | }) => { 57 | let text = `New house on ${date}: [click here](${path})`; 58 | 59 | if (income) { 60 | let extraStuff = ` 61 | residentsIncome: **${income}** 62 | neighbourhoodName: **${neighbourhoodName}** 63 | municipalityName: **${municipalityName}** 64 | residentsAge0to14: **${residentsAge0to14}** 65 | residentsAge15to24: **${residentsAge15to24}** 66 | residentsAge25to44: **${residentsAge25to44}** 67 | residentsAge45to64: **${residentsAge45to64}** 68 | residentsAge65AndOlder: **${residentsAge65AndOlder}** 69 | householdsWithChildren: **${householdsWithChildren}** 70 | residentsCount: **${residentsCount}** 71 | totalImmigrantsCount: **${totalImmigrantsCount}** 72 | shareOfNonImmigrants: **${shareOfNonImmigrants}** 73 | shareOfMorocco: **${shareOfMorocco}** 74 | shareOfAntillesOrAruba: **${shareOfAntillesOrAruba}** 75 | shareOfSuriname: **${shareOfSuriname}** 76 | shareOfTurkey: **${shareOfTurkey}** 77 | `; 78 | text = 
`${text}\n${extraStuff}`; 79 | } 80 | 81 | nodeFetch(`https://api.telegram.org/bot${BOT_API}/sendMessage`, { 82 | method: 'POST', 83 | headers: { 84 | 'Content-Type': 'application/json', 85 | }, 86 | body: JSON.stringify({ 87 | text, 88 | chat_id : CHAT_ID, 89 | parse_mode : 'markdown', 90 | }), 91 | }); 92 | }); 93 | } 94 | }; 95 | 96 | const runPuppeteer = async (url) => { 97 | console.log('opening headless browser'); 98 | const browser = await puppeteer.launch({ 99 | headless: true, 100 | args: [`--window-size=${WIDTH},${HEIGHT}`], 101 | defaultViewport: { 102 | width: WIDTH, 103 | height: HEIGHT, 104 | }, 105 | }); 106 | 107 | const page = await browser.newPage(); 108 | // https://stackoverflow.com/a/51732046/4307769 https://stackoverflow.com/a/68780400/4307769 109 | await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'); 110 | 111 | console.log('going to funda'); 112 | await page.goto(url, { waitUntil: 'domcontentloaded' }); 113 | 114 | const htmlString = await page.content(); 115 | const dom = new jsdom.JSDOM(htmlString); 116 | 117 | 118 | console.log('parsing funda.nl data'); 119 | const result = dom.window.document.querySelectorAll('.search-result'); 120 | for (const element of result) { 121 | const urlPath = element?.querySelectorAll('a')?.[0]?.href; 122 | const headerSubtitle = element?.querySelector('.search-result__header-subtitle'); 123 | const subtitleText = headerSubtitle?.innerHTML?.trim(); 124 | 125 | let path = urlPath; 126 | if (!path.includes('https://www.funda.nl')) { 127 | path = `https://www.funda.nl${urlPath}`; 128 | } 129 | 130 | path = path.replace('?navigateSource=resultlist', ''); 131 | if (path && !pastResults.has(path) && !newResults.has(path)) { 132 | let extraDetails = {}; 133 | const zipCode = getZipCode(subtitleText || ''); 134 | 135 | if (zipCode) { 136 | const neighbourhoodData = await getNeighbourhoodData(zipCode); 137 | 138 | if (neighbourhoodData) { 
139 | const residentsCount = neighbourhoodData?.['AantalInwoners_5']?.value || 0; 140 | const westernImmigrantsCount = neighbourhoodData?.['WestersTotaal_17']?.value || 0; 141 | const nonWesternImmigrantsCount = neighbourhoodData?.['NietWestersTotaal_18']?.value || 0; 142 | const totalImmigrantsCount = westernImmigrantsCount + nonWesternImmigrantsCount; 143 | const income = neighbourhoodData?.['GemiddeldInkomenPerInwoner_66']?.value * 1000; 144 | 145 | extraDetails = { 146 | ...extraDetails, 147 | income, 148 | residentsAge0to14: neighbourhoodData['k_0Tot15Jaar_8'].value, 149 | residentsAge15to24: neighbourhoodData['k_15Tot25Jaar_9'].value, 150 | residentsAge25to44: neighbourhoodData['k_25Tot45Jaar_10'].value, 151 | residentsAge45to64: neighbourhoodData['k_45Tot65Jaar_11'].value, 152 | residentsAge65AndOlder: neighbourhoodData['k_65JaarOfOuder_12'].value, 153 | householdsWithChildren: neighbourhoodData['HuishoudensMetKinderen_31'].value, 154 | totalImmigrantsCount, 155 | shareOfMorocco: convertResidentsToPercentage(residentsCount, neighbourhoodData['Marokko_19'].value), 156 | shareOfAntillesOrAruba: convertResidentsToPercentage(residentsCount, neighbourhoodData['NederlandseAntillenEnAruba_20'].value), 157 | shareOfSuriname: convertResidentsToPercentage(residentsCount, neighbourhoodData['Suriname_21'].value), 158 | shareOfTurkey: convertResidentsToPercentage(residentsCount, neighbourhoodData['Turkije_22'].value), 159 | shareOfNonImmigrants: convertResidentsToPercentage(residentsCount, residentsCount - totalImmigrantsCount), 160 | neighbourhoodName: neighbourhoodData.neighbourhoodName.value, 161 | municipalityName: neighbourhoodData.municipalityName.value, 162 | residentsCount, 163 | }; 164 | } 165 | } 166 | 167 | newResults.add(path); 168 | houses.push({ 169 | ...extraDetails, 170 | path, 171 | }); 172 | } 173 | } 174 | 175 | console.log('closing browser'); 176 | await browser.close(); 177 | }; 178 | 179 | if (CHAT_ID && BOT_API) { 180 | runTask(); 181 | } else { 
182 | console.log('Missing Telegram API keys!'); 183 | } 184 | -------------------------------------------------------------------------------- /src/utils/api.js: -------------------------------------------------------------------------------- 1 | // Code from https://github.com/nikitaindik/funda-neighbourhoods/blob/de9b65b255a4c03a9ddb581e1472f6970240d9f7/src/background/api.js 2 | import nodeFetch from 'node-fetch'; 3 | 4 | const STATS_API_ID_BY_YEAR = { 5 | 2015: '83220NED', 6 | 2016: '83487NED', 7 | 2017: '83765NED', 8 | 2018: '84286NED', 9 | 2019: '84583NED', 10 | 2020: '84799NED', 11 | 2021: '85039NED', 12 | }; 13 | 14 | export async function fetchNeighbourhoodMeta(zipCode) { 15 | const parameters = { 16 | q: zipCode, 17 | fq: 'type:adres', 18 | rows: 1, 19 | }; 20 | 21 | const urlParametersString = getParametersString(parameters); 22 | 23 | const response = await nodeFetch(`https://geodata.nationaalgeoregister.nl/locatieserver/v3/free?${urlParametersString}`); 24 | 25 | const responseJson = await response.json(); 26 | 27 | try { 28 | const firstPayloadItem = responseJson.response.docs[0]; 29 | return { 30 | neighbourhoodCode: firstPayloadItem.buurtcode, 31 | neighbourhoodName: firstPayloadItem.buurtnaam, 32 | municipalityName: firstPayloadItem.gemeentenaam, 33 | }; 34 | } catch (error) { 35 | return null; 36 | } 37 | } 38 | 39 | export async function fetchNeighbourhoodStats(neighbourhoodCode) { 40 | const neighbourhoodStatsWithYears = await getNeighbourhoodStatsWithYears(neighbourhoodCode); 41 | 42 | return mergeYearlyData(neighbourhoodStatsWithYears); 43 | } 44 | 45 | async function getNeighbourhoodStatsWithYears(neighbourhoodCode) { 46 | const years = Object.keys(STATS_API_ID_BY_YEAR); 47 | 48 | const requests = years.map(async year => { 49 | const apiId = STATS_API_ID_BY_YEAR[year]; 50 | 51 | const neighbourhoodDataForYear = await fetchDataForYear(apiId, neighbourhoodCode); 52 | 53 | if (!neighbourhoodDataForYear) { 54 | console.error('Failed to fetch 
neighbourhood stats for year:', year, 'apiId:', apiId); 55 | return null; 56 | } 57 | 58 | return processNeighbourhoodDataFromApi(year, neighbourhoodDataForYear); 59 | }); 60 | 61 | const yearlyDataForNeighbourhood = await Promise.all(requests); 62 | 63 | return yearlyDataForNeighbourhood.filter(dataForYear => dataForYear !== null); 64 | } 65 | 66 | async function fetchDataForYear(apiId, neighbourhoodCode) { 67 | const parameters = `$filter=WijkenEnBuurten eq '${neighbourhoodCode}'`; 68 | const requestUrl = `https://opendata.cbs.nl/ODataApi/odata/${apiId}/TypedDataSet?${parameters}`; 69 | 70 | try { 71 | const response = await nodeFetch(requestUrl); 72 | const responseJson = await response.json(); 73 | return responseJson.value[0]; 74 | } catch (error) { 75 | return null; 76 | } 77 | } 78 | 79 | function mergeYearlyData(yearlyData) { 80 | return Object.assign({}, ...yearlyData); 81 | } 82 | 83 | function removeEmptyFields(dataForYear) { 84 | const entries = Object.entries(dataForYear); 85 | const nonEmptyEntries = entries.filter(([, value]) => value !== null); 86 | 87 | return Object.fromEntries(nonEmptyEntries); 88 | } 89 | 90 | function addYearToEveryField(dataForYear, year) { 91 | const entries = Object.entries(dataForYear); 92 | const entriesWithYears = entries.map(([fieldName, fieldValue]) => [ 93 | fieldName, 94 | { year: Number(year), value: fieldValue }, 95 | ]); 96 | 97 | return Object.fromEntries(entriesWithYears); 98 | } 99 | 100 | function processNeighbourhoodDataFromApi(year, dataForYear) { 101 | const withoutEmptyFields = removeEmptyFields(dataForYear); 102 | 103 | return addYearToEveryField(withoutEmptyFields, year); 104 | } 105 | 106 | function getParametersString(parameters) { 107 | return Object.entries(parameters) 108 | .map(([name, value]) => `${name}=${encodeURIComponent(value)}`) 109 | .join('&'); 110 | } 111 | -------------------------------------------------------------------------------- /src/utils/utils.js: 
// Code mostly from https://github.com/nikitaindik/funda-neighbourhoods/blob/de9b65b255a4c03a9ddb581e1472f6970240d9f7/src/content/content.js#L17
import { fetchNeighbourhoodMeta, fetchNeighbourhoodStats } from './api';

// Four digits, optional whitespace, two capital letters (e.g. "1234 AB").
const ZIP_CODE_RE = /\d\d\d\d\s*[A-Z][A-Z]/;

/**
 * Formats a category count together with its share of all residents.
 *
 * @param {number} residentsCount - Total number of residents.
 * @param {number} categoryCount - Number of residents in the category.
 * @returns {string} Text in the form "<count> (<percentage>%)".
 */
export const convertResidentsToPercentage = (residentsCount, categoryCount) => {
    const shareOfResidents = categoryCount / residentsCount;
    // A zero or missing total gives NaN/Infinity; report 0% instead of "NaN%".
    const integerPercentage = Number.isFinite(shareOfResidents)
        ? Math.round(shareOfResidents * 100)
        : 0;

    return `${categoryCount} (${integerPercentage}%)`;
};

/**
 * Extracts the first zip code found in a piece of text.
 *
 * @param {string} elementText - Text to search.
 * @returns {string|null} The zip code with all whitespace removed (e.g. "1234AB"),
 *   or null when the text contains none.
 */
export const getZipCode = (elementText) => {
    // String.prototype.match returns null (not an empty array) when nothing matches.
    const match = String(elementText ?? '').match(ZIP_CODE_RE);

    if (!match) {
        return null;
    }

    // Strip every kind of whitespace the pattern accepts, not only plain spaces.
    return match[0].replace(/\s+/g, '');
};

/**
 * Loads the neighbourhood names and statistics for a zip code.
 *
 * @param {string} zipCode - Zip code as returned by `getZipCode`.
 * @returns {Promise<object|null>} The statistics fields plus `neighbourhoodName` and
 *   `municipalityName` (each wrapped as `{ value }`), or null when either lookup yields nothing.
 */
export const getNeighbourhoodData = async (zipCode) => {
    const neighbourhoodMeta = await fetchNeighbourhoodMeta(zipCode);
    if (!neighbourhoodMeta) {
        return null;
    }

    const { neighbourhoodCode, neighbourhoodName, municipalityName } = neighbourhoodMeta;
    const neighbourhood = await fetchNeighbourhoodStats(neighbourhoodCode);
    if (!neighbourhood) {
        return null;
    }

    return {
        neighbourhoodName: { value: neighbourhoodName },
        municipalityName: { value: municipalityName },
        ...neighbourhood,
    };
};