├── .codecov.yml ├── .gitignore ├── .github ├── ISSUE_TEMPLATE │ ├── dependencies-update.md │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── ci.yml │ └── scrape.yml ├── conf ├── requiredScrapers.json ├── removalMap.json ├── replacementMap.json └── bankHolidays.json ├── lib ├── bankHolidayChecker.test.js ├── ocrSpaceApiSimple.mock.json ├── objectDecider.test.js ├── objectDecider.js ├── bankHolidayChecker.js ├── mongoDbInsertMany.js ├── stringValueCleaner.test.js ├── ocrSpaceApiSimple.js ├── priceCatcher.js ├── stringValueCleaner.js ├── dateCatcher.test.js ├── priceCompareToDb.js ├── mongoDbSearch.js ├── priceCatcher.test.js ├── ocrSpaceApiSimple.test.js └── dateCatcher.js ├── src ├── date.test.js ├── restaurantMenuClasses.test.js ├── date.js ├── restaurantMenuClasses.js ├── server.js └── dailyMenuScraper.js ├── package.json ├── .eslintrc ├── LICENSE.md ├── scrapers ├── pestiDiszno.js ├── kata.js ├── drop.js ├── incognito.js ├── roza.js ├── fruccola.js ├── kamra.js ├── karcsi.js ├── vian.js ├── bodza.js ├── yamato.js ├── ketszerecsen.js ├── menza.js ├── bank3.js ├── suppe.js ├── mozsar.js ├── korhely.js ├── i55.js ├── ocrFacebookImage.js └── nokedli.js └── README.md /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | patch: 4 | default: 5 | target: 80% 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | coverage/ 3 | .local-chromium/ 4 | log/ 5 | *.png 6 | *.jpg 7 | *.pdf 8 | *.env 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/dependencies-update.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Dependencies update 3 | about: Upgrade (or downgrade) them all! 
4 | title: "[DEPENDENCIES] descriptive title" 5 | labels: dependencies 6 | assignees: theDavidBarton 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /conf/requiredScrapers.json: -------------------------------------------------------------------------------- 1 | { 2 | "scrapers": { 3 | "active": ["menza", "bank3", "i55", "kata", "ketszerecsen", "bodza", "korhely", "vian", "kamra"], 4 | "inactive": ["nokedli", "incognito", "mozsar", "pestiDiszno", "roza", "yamato", "karcsi", "drop", "suppe", "fruccola"] 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /conf/removalMap.json: -------------------------------------------------------------------------------- 1 | { 2 | "remove": "h(e|é)tf(o|ö|ő|å)(.|\\r?\\n|$)|kedd(.|\\r?\\n|$)|szerda(.|\\r?\\n|$)|cs.t(.*)(o|ö)k(.|\\r?\\n|$)|csiit.rt.k(.|\\r?\\n|$)|p(e|é)ntek(.|\\r?\\n|$)|\\bheti menü|(\\bmenü(.*)menüii)|[0-9]+ ft|,+|:|a napi |🌞|🥗|🍲|🥪|🥧|❤️|\\(\\)|ebédelj|LEVESEK|FŐÉTELEK|DESSZERTEK|finomságaink|salátáink|továbbiak|róza" 3 | } 4 | -------------------------------------------------------------------------------- /lib/bankHolidayChecker.test.js: -------------------------------------------------------------------------------- 1 | const bankHolidayChecker = require('./../lib/bankHolidayChecker') 2 | 3 | jest.mock('moment', () => () => ({ format: () => '2019-12-24' })) 4 | 5 | describe('bank holiday checker', function() { 6 | test('should recognize christmas', function() { 7 | let bankHoliday = bankHolidayChecker.bankHolidayChecker() 8 | expect(bankHoliday).toBe(true) 9 | }) 10 | }) 11 | -------------------------------------------------------------------------------- /src/date.test.js: -------------------------------------------------------------------------------- 1 | process.argv[3] = '1__2019.12.11.' // force cli argument like in case of --debug date=1__2019.12.11. 
to cover all conditions 2 | const date = require('./date').date 3 | 4 | describe('date', function() { 5 | test('should return valid date object', function() { 6 | expect(date).toBeTruthy() 7 | expect(date.todayDotSeparated).toBe('2019.12.11.') 8 | expect(date.today).toBe('1') 9 | }) 10 | }) 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] descriptive title" 5 | labels: bug 6 | assignees: theDavidBarton 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 24 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@master 16 | - name: Use Node.js 16.x 17 | uses: actions/setup-node@v1 18 | with: 19 | node-version: 16.x 20 | - name: Lint and Test 21 | run: | 22 | yarn 23 | yarn lint . 
24 | yarn test --silent 25 | - name: Upload coverage to Codecov 26 | uses: codecov/codecov-action@v1 27 | with: 28 | token: ${{ secrets.CODECOV_TOKEN }} 29 | -------------------------------------------------------------------------------- /lib/ocrSpaceApiSimple.mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "ParsedResults": [ 3 | { 4 | "TextOverlay": { 5 | "Lines": [], 6 | "HasOverlay": false, 7 | "Message": "Text overlay is not provided as it is not requested" 8 | }, 9 | "TextOrientation": "0", 10 | "FileParseExitCode": 1, 11 | "ParsedText": "I ❤ \r\nGitHuB and coding in NodeJs \r\nthis response is coming from \r\nmock (via Nock). \r\n", 12 | "ErrorMessage": ["mocked error"], 13 | "ErrorDetails": "" 14 | } 15 | ], 16 | "OCRExitCode": 1, 17 | "IsErroredOnProcessing": false, 18 | "ProcessingTimeInMilliseconds": "334", 19 | "SearchablePDFURL": "Searchable PDF not generated as it was not requested." 20 | } 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] descriptive title" 5 | labels: enhancement 6 | assignees: theDavidBarton 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "MIT", 3 | "main": "./src/dailyMenuScraper.js", 4 | "dependencies": { 5 | "cors": "^2.8.5", 6 | "express": "^4.17.1", 7 | "moment": "^2.24.0", 8 | "mongodb": "^3.3.4", 9 | "puppeteer": "^19.7.1", 10 | "request": "^2.88.0" 11 | }, 12 | "devDependencies": { 13 | "codecov": "^3.7.1", 14 | "eslint": "^7.0.0", 15 | "jest": "^26.0.1", 16 | "nock": "^13.0.2" 17 | }, 18 | "scripts": { 19 | "start": "node ./src/server.js", 20 | "scrape": "node ./src/dailyMenuScraper.js", 21 | "scrape-debug": "node ./src/dailyMenuScraper.js --debug", 22 | "test": "jest --verbose --runInBand --detectOpenHandles", 23 | "lint": "eslint" 24 | }, 25 | "engines": { 26 | "node": "16.x" 27 | }, 28 | "jest": { 29 | "coverageDirectory": "./coverage/", 30 | "collectCoverage": true 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "node": true, 4 | "commonjs": true, 5 | "es6": true 6 | }, 7 | "parserOptions": { 8 | "ecmaVersion": 9 9 | }, 10 | "rules": { 11 | "max-len": [1, { "code": 120, "tabWidth": 4, "ignoreUrls": true, "ignoreComments": true, "ignoreStrings": true }], 12 | "semi": [2, "never"], 13 | "quotes": [2, "single"], 14 | "comma-dangle": [2, "never"], 15 | "object-curly-spacing": [2, "always"], 16 | "indent": [2, 2, { "SwitchCase": 1 }], 17 | "capitalized-comments": [2, "never"], 18 | "spaced-comment": [2, "always"], 19 | "multiline-comment-style": [2, "starred-block"], 20 | "no-multiple-empty-lines": [2, { "max": 2 }], 21 | "no-mixed-spaces-and-tabs": 2, 22 | "no-trailing-spaces": 2, 23 | "no-var": 2, 24 | "no-redeclare": 2, 25 | "no-const-assign": 2, 26 | "no-self-assign": 2, 27 | "no-unused-vars": [2, { "args": "none" }] 28 | } 
29 | } 30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 David Barton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/objectDecider.test.js: -------------------------------------------------------------------------------- 1 | jest.setTimeout(30000) 2 | 3 | const objectDecider = require('./../lib/objectDecider') 4 | let decision 5 | 6 | describe('object decider', function() { 7 | test('should return true if content seems ok', async function() { 8 | decision = objectDecider.objectDecider('Burger, salad, coke') 9 | expect(decision).toBe(true) 10 | }) 11 | test('should return false if content seems not ok', async function() { 12 | decision = objectDecider.objectDecider(null) 13 | expect(decision).toBe(false) 14 | }) 15 | test('should return false if content seems not ok', async function() { 16 | decision = objectDecider.objectDecider('• Daily menu: ♪"No Milk Today"♫') 17 | expect(decision).toBe(false) 18 | }) 19 | test('should return false if content is out-of-date', async function() { 20 | decision = objectDecider.objectDecider('• Daily menu is outdated!') 21 | expect(decision).toBe(false) 22 | }) 23 | test('should return false if content is out-of-date (variation no.2)', async function() { 24 | decision = objectDecider.objectDecider('• Daily menu is out of date!') 25 | expect(decision).toBe(false) 26 | }) 27 | }) 28 | -------------------------------------------------------------------------------- /conf/replacementMap.json: -------------------------------------------------------------------------------- 1 | { 2 | "rules": { 3 | "alma": "al\\.ma", 4 | "á": "¿|ä|\\.í", 5 | "gombóc": "gomiióc", 6 | "comb": "co\\.mb", 7 | "cs": "c,s", 8 | "chili": "ciiili", 9 | "csirke": "c.sirkf.|c:s1rkf.|cs.rke|csirkf,", 10 | "e": "f\\.|f,", 11 | "é": "é\\.|ě", 12 | "es": "f,s", 13 | "fahéj": "faiiéj", 14 | "főétel": "fóétel", 15 | "főzelék": "f.ze\\/ék|ii,'özei\\.é\\.k,|fózelék|ĺő•elŕk|.ő.el.k", 16 | "füstölt": "fustblt|fostólt|fústolt|fiistölt", 17 | "fűszer": "fészer", 18 | "hús": "hcs|11ts|iiús|iiűs|iiľs", 
19 | "hagym": "hagvm|hac,ym|hagy\\.m", 20 | "ggy": "ggv", 21 | "guly": "gulv", 22 | "gyümölcs": "gyijm.lcs|gyiimiilcs|qvüm.lcs", 23 | "jázmin": "jáz.iin|iá.zmin", 24 | "körte": "kórte", 25 | "krém": "krén|k(ř|r)éh|krěm|krčrn", 26 | "l": "i\\.\\.|i\\.|1\\.", 27 | "leves": "\\/eves|lewes|\\/ewes|lf,ves|levfs", 28 | "mártás": "mártós|m\\.írtäs", 29 | "nokedli": "\\/\\S+(Zö|ďö)", 30 | "ös": "0s", 31 | "penne": "pf,nni•:", 32 | "pörkölt": "pbrkblt", 33 | "püré": "ptré|pcré", 34 | "r": "ř", 35 | "rizs": "rižs|itłzs", 36 | "saláta": "sal,íta", 37 | "sült": "siilt|sclt|solt|sijlt|sti,t|st1,t", 38 | "sütőtök": "sťľö•ľök", 39 | "t": "ľ", 40 | "tárkony": "\\bt(\\.ĺ1t|ar)kony", 41 | "tokány": "tokáxv\\.", 42 | "töltve": "tóltve", 43 | "tortill": "torti\\/\\/", 44 | "zöld": "zóld|zöli\\)", 45 | "u": "t;|u\\." 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /lib/objectDecider.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * decides whether a content string is usable.
 *
 * @param {?string} valueString: the text to judge (null / undefined mean "no content")
 * @return {boolean} true when a value is present and contains none of the
 *   rejected marker phrases listed below, false otherwise
 */
function objectDecider(valueString) {
  // a missing value can never be accepted; undefined used to throw on `.includes`
  if (valueString === null || valueString === undefined) {
    return false
  }

  // any of these phrases marks the content as a placeholder or as out-of-date
  const rejectedMarkers = ['♪"No Milk Today"♫', 'menu is outdated!', 'menu is out of date!']

  return !rejectedMarkers.some(marker => valueString.includes(marker))
}
/*
 * tells whether the current day is listed in conf/bankHolidays.json.
 *
 * @return {boolean} true if today's date (YYYY-MM-DD) is listed under its year,
 *   false if it is not listed, if the year has no list at all, or if the lookup fails
 */
function bankHolidayChecker() {
  try {
    const today = moment().format('YYYY-MM-DD')
    const year = today.match(/[0-9]{4}/)[0]
    const holidaysOfYear = bankHolidays[year]

    // an unconfigured year is an expected situation: report it readably instead of via a TypeError
    if (!Array.isArray(holidaysOfYear)) {
      console.warn('no bank holiday list is configured for ' + year)
      return false
    }
    return holidaysOfYear.includes(today)
  } catch (e) {
    console.error(e)
    // callers always get a boolean, even if the date could not be determined
    return false
  }
}
actions/setup-node@v1 40 | with: 41 | node-version: 16.x 42 | - name: Install project 43 | run: yarn 44 | - name: Run main script 45 | env: 46 | MONGO_PASSWORD: ${{ secrets.MONGO_PASSWORD }} 47 | MONGO_USERNAME: ${{ secrets.MONGO_USERNAME }} 48 | OCR_API_KEY: ${{ secrets.OCR_API_KEY }} 49 | WEBHOOK_URL_PROD: ${{ secrets.WEBHOOK_URL_PROD }} 50 | WEBHOOK_URL_TEST: ${{ secrets.WEBHOOK_URL_TEST }} 51 | FB_USERNAME: ${{ secrets.FB_USERNAME }} 52 | FB_PASSWORD: ${{ secrets.FB_PASSWORD }} 53 | run: node ./src/dailyMenuScraper.js --debug 54 | -------------------------------------------------------------------------------- /lib/mongoDbInsertMany.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
/*
 * inserts the given documents into the daily_menu.daily_menu_data collection.
 * failures are logged and swallowed (best effort): the returned promise never
 * rejects because of a connection, insert or close error.
 *
 * @param {Object[]} contentToInsert: documents handed over to insertMany
 * @return {Promise<undefined>}
 */
async function mongoDbInsertMany(contentToInsert) {
  let client
  try {
    client = await MongoClient.connect(uri, {
      useNewUrlParser: true,
      useUnifiedTopology: true
    })
    const collection = client.db('daily_menu').collection('daily_menu_data')
    await collection.insertMany(contentToInsert)
  } catch (e) {
    console.error(e)
  } finally {
    // client stays undefined when connect itself failed: there is nothing to close then
    if (client) {
      try {
        await client.close()
      } catch (e) {
        console.error(e)
      }
    }
  }
}
const testStringToReplace = 'Pénteki heti menü: tökfőzelék sóskával '
const testStringOCR = ' A mai menü: fostólt sclt 11ts hagvmás krump1.iptrével'

/*
 * note on the whitespace checks: toContain() on a string does a substring lookup,
 * so handing it a RegExp never verifies anything. toMatch() is the matcher that
 * actually evaluates the pattern against the cleaned string.
 */
describe('string value cleaner', () => {
  test('should cleanup unnecessary spaces', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner(testStringWithSpaces, false)
    expect(cleanedString).not.toMatch(/\s\s+/)
  })
  test('should cleanup unnecessary spaces if OCR was enabled', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner(testStringWithSpaces, true)
    expect(cleanedString).not.toMatch(/\s\s+/)
  })
  test('should remove unneccesary patterns', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner(testStringToReplace, false)
    expect(cleanedString).toBe('tökfőzelék sóskával')
  })
  test('should use replacement map on OCR-d string', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner(testStringOCR, true)
    expect(cleanedString).toBe('a mai menü füstölt sült hús hagymás krumplipürével')
  })
  test('should return n/a if input is null', async () => {
    const cleanedStringNull = await stringValueCleaner.stringValueCleaner(null, false)
    expect(cleanedStringNull).toBe('n/a')
  })
  test('should remove specific words', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner('Hétfő csirkepörkölt puding kedd', false)
    expect(cleanedString).toBe('csirkepörkölt puding')
  })
  test('should remove extra commas', async () => {
    const cleanedString = await stringValueCleaner.stringValueCleaner('Hétfő csirkepörkölt,,,,,, puding,, kedd', false)
    expect(cleanedString).toBe('csirkepörkölt puding')
  })
})
-------------------------------------------------------------------------------- 1 | process.argv[3] = '1__2019.12.11.' // force cli argument like in case of --debug date=1__2019.12.11. to cover all conditions 2 | 3 | const RestaurantMenuOutput = require('./restaurantMenuClasses').RestaurantMenuOutput 4 | const RestaurantMenuDb = require('./restaurantMenuClasses').RestaurantMenuDb 5 | 6 | describe('RestaurantMenuOutput class and RestaurantMenuDb class', function() { 7 | test('RestaurantMenuOutput should exist and should create new object', function() { 8 | expect(RestaurantMenuOutput).toBeTruthy() 9 | const testObj = new RestaurantMenuOutput( 10 | 'red', 11 | 'Example Restaurant', 12 | 'www.github.com', 13 | './icon.png', 14 | 'Soup Meat Drink', 15 | '1500', 16 | 'HUF', 17 | 'Ft', 18 | 'Budapest, Heroes sqr. 1.' 19 | ) 20 | expect(testObj.color).toBe('red') 21 | expect(testObj.author_name).toBe('EXAMPLE RESTAURANT') // w upperCase 22 | expect(testObj.author_link).toBe('www.github.com') 23 | expect(testObj.author_icon).toBe('./icon.png') 24 | expect(testObj.fields).toBeTruthy() 25 | expect(testObj.fields[0].title).toContain('Example Restaurant') 26 | expect(testObj.fields[0].value).toBe('Soup Meat Drink') 27 | expect(testObj.fields[0].short).toBe(false) 28 | expect(testObj.fields[1].title).toContain('HUF') 29 | expect(testObj.fields[1].value).toBe('1500Ft') 30 | expect(testObj.fields[1].short).toBe(true) 31 | expect(testObj.fields[2].title).toBeTruthy() 32 | expect(testObj.fields[2].value).toBe('Budapest, Heroes sqr. 
1.') 33 | expect(testObj.fields[2].short).toBe(true) 34 | expect(testObj.footer).toBe('scraped by DailyMenu') 35 | expect(testObj.ts).toBeTruthy() 36 | }) 37 | test('RestaurantMenuDb should exist and should create new object', function() { 38 | expect(RestaurantMenuDb).toBeTruthy() 39 | const testDbObj = new RestaurantMenuDb('Example Restaurant', '1500', 'HUF', 'Soup Meat Drink') 40 | expect(testDbObj.timestamp).toBeTruthy() 41 | expect(testDbObj.restaurant).toBe('Example Restaurant') 42 | expect(testDbObj.price).toBe('1500') 43 | expect(testDbObj.currency).toBe('HUF') 44 | expect(testDbObj.menuString).toBe('Soup Meat Drink') 45 | }) 46 | }) 47 | -------------------------------------------------------------------------------- /lib/ocrSpaceApiSimple.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * sends one request to the OCR API and returns the first parsed result.
 *
 * @param {Object} options: passed through untouched to request()
 * @return {Promise<Object|undefined>} the first element of "ParsedResults" when the
 *   response reports an OCRExitCode below 3; undefined when the request failed,
 *   the body was not valid JSON or the API reported an error (the reason is logged).
 *   most of the cases you will need "ParsedText" of the returned object
 */
async function ocrSpaceApiSimple(options) {
  // (I.) promise to return the parsed result for processing
  function ocrRequest() {
    return new Promise((resolve, reject) => {
      request(options, (error, response, body) => {
        // transport level failure: there is no body to parse, report the real cause
        if (error) {
          reject(error)
          return
        }
        try {
          const parsedBody = JSON.parse(body)
          if (parsedBody.OCRExitCode < 3) {
            resolve(parsedBody.ParsedResults[0])
          } else {
            reject(parsedBody.ErrorMessage ? parsedBody.ErrorMessage : parsedBody)
          }
        } catch (e) {
          reject(e)
        }
      })
    })
  }

  // (II.) local on purpose: a failed call must not hand back the result of an earlier call
  let parsedResult
  try {
    parsedResult = await ocrRequest()
  } catch (e) {
    console.error(e)
  }
  return parsedResult
}
// one snapshot of "now", so every derived field below describes the same instant
const now = moment()

const date = {
  bankHoliday: bankHolidayChecker(),
  today: Number(now.format('d')),
  todayFormatted: now.format('LLLL'),
  // clone first: locale() changes the instance it is called on
  todayDotSeparated: now.clone().locale('hu').format('L'), // e.g. 2019.05.17. (default format for Hungarian)
  dayNames: ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
}

/*
 * for debug purposes you can run the main script with a 2nd argument like:
 * `yarn scrape-debug --date=2__2019.12.24.`
 * where 2 means: Tuesday (0: Sunday, 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday)
 * and 2019.12.24. overrides todayDotSeparated
 *
 * note: in debug mode date.today is the digit as a string (e.g. '2'), not a number
 */

if (process.argv[3]) {
  const [dayPart, datePart] = process.argv[3].split('__')
  const dayDigit = dayPart.match(/[0-9]/)

  if (dayDigit !== null && datePart) {
    date.today = dayDigit[0]
    date.todayDotSeparated = datePart
    console.log('!!! RUNNING IN DEBUG MODE !!! ', date.todayDotSeparated)
  } else {
    // an unrelated or mistyped argument must not crash the module on load
    console.error('ignoring malformed debug date argument (expected e.g. --date=2__2019.12.24.):', process.argv[3])
  }
}

module.exports.date = date
"2024-08-20", 98 | "2024-10-23", 99 | "2024-11-01", 100 | "2024-12-24", 101 | "2024-12-25", 102 | "2024-12-26" 103 | ], 104 | "2025": [null], 105 | "2026": [null], 106 | "2027": [null], 107 | "2028": [null], 108 | "2029": [null], 109 | "2030": [null] 110 | } 111 | -------------------------------------------------------------------------------- /lib/priceCatcher.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
// matches a forint amount (optionally with a thousands separator and a trailing comma)
// followed, within at most three characters, by "ft" or "huf"
const PRICE_REGEX = /(([0-9]{1}|[0-9]{2}|[0-9]{3}|[0-9](\.|\s|,|)[0-9]{3})(,|))(|.|..|...)(ft|huf)/gim

// decimal part such as ",00" that is followed by a non-digit (e.g. "1 200,00 Ft")
const DECIMAL_PART_REGEX = /,[0-9][0-9][^\d]/g

/**
 * Finds a Hungarian forint price in a text.
 *
 * @param {string} textContent - the whole text to search for prices
 * @param {number} [iKnowBetter=0] - optional zero-based index of the match to use
 *   when the very first price in the text is not the one to display
 * @returns {{price: string, priceCurrencyStr: string, priceCurrency: string}}
 *   price: the selected price as digits only (e.g. '1200'), or 'n/a' when nothing usable was found;
 *   priceCurrencyStr: the currency displayed with the price in the output (' Ft' or '');
 *   priceCurrency: the currency for the database ('HUF' or 'n/a')
 */
function priceCatcher(textContent, iKnowBetter = 0) {
  // a fresh object per call, so callers never share (or mutate) a common result
  const notAvailable = () => ({ price: 'n/a', priceCurrencyStr: '', priceCurrency: 'n/a' })

  if (textContent === undefined || textContent === null) {
    return notAvailable()
  }

  const matches = String(textContent).match(PRICE_REGEX)
  // no price in the text, or the requested index points past the found prices
  const selected = matches === null ? undefined : matches[iKnowBetter]
  if (selected === undefined) {
    return notAvailable()
  }

  // drop the decimals first, otherwise "1 200,00 Ft" would end up as 120000
  const price = selected.replace(DECIMAL_PART_REGEX, '').replace(/[^0-9]/g, '')
  return { price, priceCurrencyStr: ' Ft', priceCurrency: 'HUF' }
}

// CommonJS export; guarded so the file can also be evaluated where `module` is not defined
if (typeof module !== 'undefined' && module.exports) {
  module.exports.priceCatcher = priceCatcher
}
const removalMap = require('./../conf/removalMap.json')
const replacementMap = require('./../conf/replacementMap.json')

/**
 * Cleans a raw (scraped or OCR-ed) menu text.
 *
 * Everything matching the `remove` pattern of conf/removalMap.json is deleted; optionally the
 * rules of conf/replacementMap.json are applied; finally whitespace is normalized.
 *
 * NOTE: the work done here is synchronous; the function stays `async` only so that existing
 * callers that rely on the returned promise keep working.
 *
 * @param {string} stringValue - the raw string to clean; other non-null values are converted with String()
 * @param {boolean} replacementNeeded - pass true if the string comes via OCR (or has some noise),
 *   so the replacement rules are applied (this also lower-cases the text)
 * @returns {Promise<string>} the cleaned string, or 'n/a' when stringValue is null or undefined
 */
async function stringValueCleaner(stringValue, replacementNeeded) {
  // nothing to clean: previously `undefined` crashed on `.toString()`
  if (stringValue === null || stringValue === undefined) {
    return 'n/a'
  }

  // remove unnecessary strings (day names, prices, emojis, ...)
  let cleaned = String(stringValue).replace(new RegExp(removalMap.remove, 'gi'), '')

  // format text and replace faulty string parts remaining after OCR:
  // each key of `rules` is the replacement, its value is the pattern to be replaced
  if (replacementNeeded === true) {
    for (const rule of Object.keys(replacementMap.rules)) {
      cleaned = cleaned.toLowerCase().replace(new RegExp(replacementMap.rules[rule], 'gi'), rule)
    }
  }

  // collapse whitespace runs, remove the space left in front of commas
  return cleaned
    .replace(/\s+/g, ' ')
    .replace(/\s,/g, ',')
    .trim()
}
module.exports.stringValueCleaner = stringValueCleaner
'use strict'

const date = require('./../src/date').date

/**
 * Menu object for the output message.
 * NOTE(review): the property names (author_name, fields, footer, ts) look like a
 * Slack-style message attachment - confirm against the consumer of this object.
 */
class RestaurantMenuOutput {
  /**
   * @param {string} color - value of the `color` property
   * @param {string} title - restaurant name; upper-cased for `author_name`, used as-is in the menu field title
   * @param {string} url - value of `author_link`
   * @param {string} icon - value of `author_icon`
   * @param {string} value - menu text shown in the first field
   * @param {string} price - price shown in the second field
   * @param {string} currency - currency shown in the title of the price field
   * @param {string} priceCurrency - suffix appended directly to the price
   * @param {string} address - address shown in the third field
   */
  constructor(color, title, url, icon, value, price, currency, priceCurrency, address) {
    const menuField = {
      title: title + ' menu (' + date.dayNames[date.today] + '):',
      value,
      short: false
    }
    const priceField = {
      title: 'price (' + currency + ')',
      value: price + priceCurrency,
      short: true
    }
    const addressField = {
      title: 'address',
      value: address,
      short: true
    }

    Object.assign(this, {
      color,
      author_name: title.toUpperCase(),
      author_link: url,
      author_icon: icon,
      fields: [menuField, priceField, addressField],
      footer: 'scraped by DailyMenu',
      ts: Math.floor(Date.now() / 1000) // creation time in whole seconds
    })
  }
}

/**
 * Menu object for the database; `timestamp` is taken from `date.todayDotSeparated`.
 */
class RestaurantMenuDb {
  /**
   * @param {string} titleString - stored as `restaurant`
   * @param {string} priceString - stored as `price`
   * @param {string} priceCurrency - stored as `currency`
   * @param {string} valueString - stored as `menuString`
   */
  constructor(titleString, priceString, priceCurrency, valueString) {
    Object.assign(this, {
      timestamp: date.todayDotSeparated,
      restaurant: titleString,
      price: priceString,
      currency: priceCurrency,
      menuString: valueString
    })
  }
}

module.exports = { RestaurantMenuOutput, RestaurantMenuDb }
const ocrFacebookImage = require('./ocrFacebookImage')

/**
 * @ PESTI DISZNO
 * ---------------------------------------
 * contact info:
 * Budapest, Nagymező u. 19, 1063
 * Phone: +36 (1) 951 4061
 * ---------------------------------------
 * description:
 * this daily menu relies on if a menu (recognizable for OCR) is available among timeline photos
 *
 * Hands the Pesti Diszno parameters over to ocrFacebookImage and waits for it to finish.
 */
async function scraper() {
  // slot 0 is null, the other five slots each get their own instance of the same catch-all pattern
  const daysRegexArray = [null, ...Array.from({ length: 5 }, () => /[^%]*/g)]

  // @ PESTI DISZNO parameters, in the order the sibling scrapers pass them
  const parameters = [
    '#000000', // color
    'Pesti Diszno', // titleString
    'https://www.facebook.com/pg/PestiDiszno/posts/', // url
    'http://www.pestidiszno.hu/img/pdlogob2.png', // icon
    'Budapest, Nagymező u. 19, 1063', // addressString
    daysRegexArray,
    'img[class^="scaledImageFit"]', // facebookImageUrlSelector
    /fogás/gi, // menuHandleRegex
    3, // startLine
    17, // endLine
    false // zoomIn
  ]

  await ocrFacebookImage.ocrFacebookImage(...parameters)
}
module.exports.scraper = scraper
const ocrFacebookImage = require('./ocrFacebookImage')

/*
 * Scraper of the Kata restaurant: defines the restaurant specific parameters
 * and passes them to ocrFacebookImage, then waits for it to finish.
 */
async function scraper() {
  /*
   * @ KATA
   * ---------------------------------------
   * contact info:
   * Address: Budapest, 1065, Hajós u. 27.
   * Phone: +36(1) 302 4614
   * ---------------------------------------
   * description:
   * this daily menu relies on if a menu (recognizable for OCR) is available among timeline photos
   */

  // @ KATA parameters
  const color = '#3C5A99'
  const titleString = 'Kata (Chagall)'
  const url = 'https://m.facebook.com/katarestaurantbudapest/posts/?ref=page_internal&locale=hu_HU&_rdr'
  const icon =
    'https://lh3.googleusercontent.com/GrM72gaBN1l7BUgUuWI5T9w2zc1qxsKFNukg6Szp-lXXpfG0wmnxT2FA_o725nmAiZkxGmf_=w1080-h608-p-no-v0'
  const addressString = 'Budapest, 1065, Hajós u. 27.'
  // one pattern per day name (index 0 is null): either the text between this day name and the
  // next marker, or - as a fallback - the day name followed by the next 60 characters
  const daysRegexArray = [
    null,
    /(\WHÉTF.*\WKEDD\W)|(\WHÉTF.{60})/gi,
    /(\WKEDD.*\WSZERDA\W)|(\WKEDD.{60})/gi,
    /(\WSZERDA.*\WCSÜTÖRTÖK\W)|(\WSZERDA.{60})/gi,
    /(\WCSÜTÖRTÖK.*\WPÉNTEK\W)|(\WCSÜTÖRTÖK.{60})/gi,
    /(\WPÉNTEK.*\WA.SZÁMLA)|(\WPÉNTEK.{60})/gi
  ]
  const menuHandleRegex = /heti/gi

  // NOTE(review): only 7 arguments are passed here and menuHandleRegex is the 7th one, while the
  // pestiDiszno, drop and incognito scrapers pass facebookImageUrlSelector as 7th and
  // menuHandleRegex as 8th argument (followed by startLine, endLine, zoomIn).
  // TODO confirm against the signature of ocrFacebookImage that this order is intended.
  await ocrFacebookImage.ocrFacebookImage(
    color,
    titleString,
    url,
    icon,
    addressString,
    daysRegexArray,
    menuHandleRegex
  )
}
module.exports.scraper = scraper
// ---- /src/server.js ----

'use strict'

const express = require('express')
const cors = require('cors')
const mongoDbSearch = require('./../lib/mongoDbSearch').mongoDbSearch

/**
 * Creates the Express application with CORS enabled, registers the daily menu endpoints and
 * starts listening on process.env.PORT (5000 if it is not set).
 *
 * Endpoints:
 *   GET /api/1/daily-menu/       - result of mongoDbSearch() without a date
 *   GET /api/1/daily-menu/:date  - result of mongoDbSearch(date); a value that does not contain
 *                                  a YYYY-MM-DD like date is passed on as null
 *
 * Every request gets a response: if the search throws or returns nothing usable, an error JSON
 * is sent instead of leaving the request hanging.
 */
function endpointCreation() {
  const internalError = { error: 'something must be wrong on our side, come back later!' }

  // sends the generic 500 answer unless a response has already been started
  const replyWithInternalError = res => {
    if (!res.headersSent) {
      res.status(500).json(internalError)
    }
  }

  try {
    const app = express()
    app.use(cors())
    const port = process.env.PORT || 5000

    app.get('/api/1/daily-menu/', async (req, res) => {
      try {
        const results = await mongoDbSearch()
        // mongoDbSearch returns undefined when the database call failed
        if (results && results[0]) {
          res.json(results)
        } else {
          res.status(500).json(internalError)
        }
        console.log('/api/1/daily-menu/ endpoint has been called!')
      } catch (e) {
        console.error(e)
        replyWithInternalError(res)
      }
    })

    app.get('/api/1/daily-menu/:date', async (req, res) => {
      try {
        const looksLikeDate = /[1-2][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/.test(req.params.date)
        const date = looksLikeDate ? req.params.date : null

        const results = await mongoDbSearch(date)
        if (results && results[0]) {
          res.json(results)
        } else {
          res.status(404).json({ error: 'no menu for the selected date!' })
        }
        console.log(`/api/1/daily-menu/${date} endpoint has been called!`)
      } catch (e) {
        console.error(e)
        replyWithInternalError(res)
      }
    })

    app.listen(port)

    console.log(`API is listening on ${port}\nendpoint is available at: /api/1/daily-menu/`)
  } catch (e) {
    console.error(e)
  }
}
endpointCreation()

// ---- /scrapers/drop.js ----

const ocrFacebookImage = require('./ocrFacebookImage')

/*
 * Scraper of Drop Restaurant: defines the restaurant specific parameters
 * and passes them to ocrFacebookImage, then waits for it to finish.
 */
async function scraper() {
  /*
   * @ DROP
   * ---------------------------------------
   * contact info:
   * Address: Budapest, 1065, Hajós u. 27.
   * Phone: +36 1 235 0468
   * ---------------------------------------
   * description:
   * this daily menu relies on if a menu (recognizable for OCR) is available among timeline photos
   */

  // @ DROP parameters
  const color = '#d3cd78'
  const titleString = 'Drop Restaurant'
  const url = 'https://www.facebook.com/pg/droprestaurant/posts/?ref=page_internal'
  const icon = 'http://droprestaurant.com/public/wp-content/uploads/2015/07/logo-header.png'
  const addressString = 'Budapest, 1065, Hajós u. 27'
  // one pattern per day name (index 0 is null): the day name plus the following lines
  const daysRegexArray = [
    null,
    /\bHÉT((.*\r?\n){4})/gi,
    /\bKEDD((.*\r?\n){4})/gi,
    /\bSZERD((.*\r?\n){4})/gi,
    /(\bCSÜT|\bCSIIT|\bCSUT)((.*\r?\n){4})/gi,
    /\bPÉNT((.*\r?\n){3})/gi
  ]
  const facebookImageUrlSelector = '.scaledImageFitWidth'
  const menuHandleRegex = /Szerda/gi
  const startLine = 0
  const endLine = 7
  const zoomIn = true

  await ocrFacebookImage.ocrFacebookImage(
    color,
    titleString,
    url,
    icon,
    addressString,
    daysRegexArray,
    facebookImageUrlSelector,
    menuHandleRegex,
    startLine,
    endLine,
    zoomIn
  )
}
module.exports.scraper = scraper
* The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | const ocrFacebookImage = require('./ocrFacebookImage') 27 | 28 | async function scraper() { 29 | /* 30 | * @ INCOGNITO 31 | * --------------------------------------- 32 | * contact info: 33 | * Address: Budapest, Liszt tér 34 | * --------------------------------------- 35 | * description: 36 | * this daily menu relies on if a menu (recognizable for OCR) is available among timeline photos 37 | */ 38 | 39 | // @ INCOGNITO parameters 40 | const color = '#cc2c2c' 41 | const titleString = 'Incognito' 42 | const url = 'https://www.facebook.com/pg/cafeincognito/posts/' 43 | const icon = 'https://www.nicepng.com/png/detail/141-1415218_incognito-logo-incognito-mode-icon.png' 44 | const addressString = 'Budapest, Liszt tér' 45 | const daysRegexArray = [ 46 | null, 47 | /\bHÉT((.*\r?\n){3})/gi, 48 | /\bKED((.*\r?\n){3})/gi, 49 | /\bSZERD((.*\r?\n){3})/gi, 50 | /\bCSOT((.*\r?\n){3})|\bCSU((.*\r?\n){3})|\bCSÜ((.*\r?\n){3})|\bCsiitörtök((.*\r?\n){3})|törtök((.*\r?\n){3})/gi, 51 | /\bPÉNT((.*\r?\n){3})/gi 52 | ] 53 | const facebookImageUrlSelector = '.scaledImageFitHeight' 54 | const menuHandleRegex = /HETI MENÜ/gi 55 | const startLine = 1 56 | const endLine = 2 57 | const zoomIn = false 58 | 59 | await ocrFacebookImage.ocrFacebookImage( 60 | color, 61 | titleString, 62 | url, 63 | icon, 64 | addressString, 65 | 
daysRegexArray, 66 | facebookImageUrlSelector, 67 | menuHandleRegex, 68 | startLine, 69 | endLine, 70 | zoomIn 71 | ) 72 | } 73 | module.exports.scraper = scraper 74 | -------------------------------------------------------------------------------- /lib/dateCatcher.test.js: -------------------------------------------------------------------------------- 1 | jest.setTimeout(30000) 2 | 3 | const dateCatcher = require('./../lib/dateCatcher') 4 | 5 | jest.mock('./../src/date', () => ({ date: { todayDotSeparated: '2019.06.19.' } })) 6 | 7 | const mockedContent = [ 8 | 'Menu for 2019. 06. 19. is the following...', 9 | 'Menu for 2019. 06. 10. is the following...', 10 | 'Menu for 2019-06-19 is the following...', 11 | 'Menu between 2019. június 17. and 21. is...', 12 | 'Menu between 2019. június 10. and 14. is...', 13 | 'Menu between 2019.06.17. - 2019.06.21. is...', 14 | 'Menu for 2019. 06.19. is the following...', 15 | '2019. 06.19.  - HÉTFÕKhao Soi levesSushi: Kani 2 nagy maki tekercsCsípõs pirított tészta pálcikás rákkalSzójababcsíra salátaBrownie2019. 06.20. - KEDDMiso levesSushi: azac quinoa donPad thai rizs csirkévelEdamame babMatchateás sajttorta2019. 06.21. - SZERDAThai curry leves rákkalSushi: Akasaka nagy maki tekercs 2dbBulgogi rizzselFügés kecskesajt salátaMandulás túrógolyó2019. 06.22. - CSÜTÖRTÖKPho Bo 2019-06-22 .' 16 | ] 17 | let found 18 | 19 | describe('date catcher', function() { 20 | test('should catch 2019. 06. 19. (dot separated)', async function() { 21 | found = await dateCatcher.dateCatcher(mockedContent[0]) 22 | expect(found).toBe(true) 23 | }) 24 | test('should not catch 2019. 06. 10. 
(dot separated)', async function() { 25 | found = await dateCatcher.dateCatcher(mockedContent[1]) 26 | expect(found).toBe(false) 27 | }) 28 | test('should catch 2019-06-19 (dash separated)', async function() { 29 | found = await dateCatcher.dateCatcher(mockedContent[2]) 30 | expect(found).toBe(true) 31 | }) 32 | test('should throw error if promise rejects', async function() { 33 | found = await dateCatcher.dateCatcher(undefined).rejects 34 | expect(found).toBeUndefined() 35 | }) 36 | test('should match interval if it is on the same week', async function() { 37 | found = await dateCatcher.dateCatcher(mockedContent[3], true) 38 | expect(found).toBe(true) 39 | }) 40 | test('should not match interval if it is from previous week', async function() { 41 | found = await dateCatcher.dateCatcher(mockedContent[4], true) 42 | expect(found).toBe(false) 43 | }) 44 | test('should match interval if it is on the same week (dot separated)', async function() { 45 | found = await dateCatcher.dateCatcher(mockedContent[5], true) 46 | expect(found).toBe(true) 47 | }) 48 | test('should catch 2019. 06.19. (dot separated, mixed spacing)', async function() { 49 | found = await dateCatcher.dateCatcher(mockedContent[6]) 50 | expect(found).toBe(true) 51 | }) 52 | test('should catch 2019. 06.19. 
(dot separated, mixed spacing) in a longer text with multiple other dates', async function() { 53 | found = await dateCatcher.dateCatcher(mockedContent[7]) 54 | expect(found).toBe(true) 55 | }) 56 | }) 57 | -------------------------------------------------------------------------------- /lib/priceCompareToDb.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
'use strict'

const MongoClient = require('mongodb').MongoClient

const mongoUsername = process.env.MONGO_USERNAME
const mongoPassword = process.env.MONGO_PASSWORD
const uri =
  'mongodb+srv://' +
  mongoUsername +
  ':' +
  mongoPassword +
  '@mongodailymenu001-gn6yb.gcp.mongodb.net/daily_menu?retryWrites=true&w=majority'

/**
 * Compares the current price of a restaurant with the price of its most recently inserted
 * record (highest _id) in the daily_menu_data collection.
 *
 * @param {string} contentToFind - exact name of the restaurant
 * @param {string} currentPrice - the returned price of priceCatcher.js
 * @returns {Promise<string>} trend: '▲' if the price went up, '▼' if it went down, otherwise ''
 *   (also '' when one of the prices is not a number or the database lookup failed);
 *   a restaurant without any earlier record is compared to a previous price of 0
 */
async function priceCompareToDb(contentToFind, currentPrice) {
  let client
  let res = null // stays null if the lookup fails
  try {
    client = await MongoClient.connect(uri, {
      useNewUrlParser: true,
      useUnifiedTopology: true
    })

    const db = client.db('daily_menu').collection('daily_menu_data')
    res = await db
      .find({ restaurant: contentToFind })
      .sort({ _id: -1 })
      .limit(1)
      .toArray()
  } catch (e) {
    console.error(e)
  } finally {
    // release the connection even if the query threw
    if (client) {
      try {
        await client.close()
      } catch (e) {
        console.error(e)
      }
    }
  }

  // without a database answer there is nothing to compare to
  if (!res) {
    return ''
  }

  const prevPrice = parseInt(res[0] ? res[0].price : 0, 10)
  const actualPrice = parseInt(currentPrice, 10)
  // comparisons with NaN (e.g. price 'n/a') are always false, which results in ''
  if (prevPrice < actualPrice) {
    return '▲'
  }
  if (prevPrice > actualPrice) {
    return '▼'
  }
  return ''
}

module.exports.priceCompareToDb = priceCompareToDb
24 | */ 25 | 26 | /* 27 | * @param {string} date: in dash separated format like 2020-01-11 28 | * @returns {object}: the menu content of a specific day, either selected date (from param) or latest valid date 29 | */ 30 | 31 | 'use strict' 32 | 33 | const MongoClient = require('mongodb').MongoClient 34 | const moment = require('moment') 35 | 36 | const mongoUsername = process.env.MONGO_USERNAME 37 | const mongoPassword = process.env.MONGO_PASSWORD 38 | const uri = 39 | 'mongodb+srv://' + 40 | mongoUsername + 41 | ':' + 42 | mongoPassword + 43 | '@mongodailymenu001-gn6yb.gcp.mongodb.net/daily_menu?retryWrites=true&w=majority' 44 | 45 | async function mongoDbSearch(date) { 46 | let client 47 | let res 48 | const dateFormatted = moment(date, 'YYYY-MM-DD') 49 | .locale('hu') 50 | .format('L') 51 | 52 | try { 53 | client = await MongoClient.connect(uri, { 54 | useNewUrlParser: true, 55 | useUnifiedTopology: true 56 | }) 57 | 58 | const db = client.db('daily_menu').collection('daily_menu_data') 59 | if (date) { 60 | res = await db.find({ timestamp: dateFormatted }).toArray() 61 | } else { 62 | let latestValidTimestamp = await db 63 | .aggregate([ 64 | { $sort: { timestamp: 1, restaurant: 1 } }, 65 | { 66 | $group: { 67 | _id: '$restaurant', 68 | lastTimestamp: { $last: '$timestamp' } 69 | } 70 | } 71 | ]) 72 | .limit(1) 73 | .toArray() 74 | latestValidTimestamp = await latestValidTimestamp[0].lastTimestamp 75 | res = await db.find({ timestamp: latestValidTimestamp }).toArray() 76 | } 77 | 78 | client.close() 79 | } catch (e) { 80 | console.error(e) 81 | } 82 | return res 83 | } 84 | 85 | module.exports.mongoDbSearch = mongoDbSearch 86 | -------------------------------------------------------------------------------- /lib/priceCatcher.test.js: -------------------------------------------------------------------------------- 1 | jest.setTimeout(30000) 2 | 3 | const priceCatcher = require('./../lib/priceCatcher') 4 | const mockedContent = [ 5 | 'Menu price is 1,200 ft today', 
6 | 'Menu price is 1.200Ft today', 7 | 'Menu price is 1200 ft today', 8 | 'Menu price is 1200,- ft today', 9 | 'Menu price is 1 200 HUF today', 10 | 'Menu price is 1\xa0200,00 Ft today', 11 | 'Menu price is one thousand two hundred HUF today', 12 | 'Menu prices are: 1000 Ft (soup) and 1200 Ft (main dish) today' 13 | ] 14 | 15 | describe('price catcher', function() { 16 | test('should catch 1,200 ft (comma as thousands separator)', function() { 17 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[0]) 18 | expect(price).toBe('1200') 19 | expect(price + priceCurrencyStr).toBe('1200 Ft') 20 | expect(priceCurrency).toBe('HUF') 21 | }) 22 | test('should catch 1.200Ft (dot as thousands separator + no space between currency)', function() { 23 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[1]) 24 | expect(price).toBe('1200') 25 | expect(price + priceCurrencyStr).toBe('1200 Ft') 26 | expect(priceCurrency).toBe('HUF') 27 | }) 28 | test('should catch 1200 ft (without thousands separator)', function() { 29 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[2]) 30 | expect(price).toBe('1200') 31 | expect(price + priceCurrencyStr).toBe('1200 Ft') 32 | expect(priceCurrency).toBe('HUF') 33 | }) 34 | test('should catch 1200,- ft (with local currency delimiter)', function() { 35 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[3]) 36 | expect(price).toBe('1200') 37 | expect(price + priceCurrencyStr).toBe('1200 Ft') 38 | expect(priceCurrency).toBe('HUF') 39 | }) 40 | test('should catch 1 200 HUF (space + international currency)', function() { 41 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[4]) 42 | expect(price).toBe('1200') 43 | expect(price + priceCurrencyStr).toBe('1200 Ft') 44 | expect(priceCurrency).toBe('HUF') 45 | }) 46 | test('should catch 1\xa0200,00 Ft (non-breaking 
space + decimal separator)', function() { 47 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[5]) 48 | expect(price).toBe('1200') 49 | expect(price + priceCurrencyStr).toBe('1200 Ft') 50 | expect(priceCurrency).toBe('HUF') 51 | }) 52 | test('should not catch one thousand two hundred HUF (text price)', function() { 53 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[6]) 54 | expect(price).toBe('n/a') 55 | expect(price + priceCurrencyStr).toBe('n/a') 56 | expect(priceCurrency).toBe('n/a') 57 | }) 58 | test('should catch the 2nd price from the string', function() { 59 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[7], 1) 60 | expect(price).toBe('1200') 61 | expect(price + priceCurrencyStr).toBe('1200 Ft') 62 | expect(priceCurrency).toBe('HUF') 63 | }) 64 | test('should not break function if input is empty (undefined)', function() { 65 | let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(mockedContent[8]) // there is no such element of the object : undefined 66 | expect(price).toBe('n/a') 67 | expect(price + priceCurrencyStr).toBe('n/a') 68 | expect(priceCurrency).toBe('n/a') 69 | }) 70 | }) 71 | -------------------------------------------------------------------------------- /lib/ocrSpaceApiSimple.test.js: -------------------------------------------------------------------------------- 1 | jest.setTimeout(30000) 2 | 3 | const nock = require('nock') 4 | const ocrSpaceApiSimple = require('./../lib/ocrSpaceApiSimple') 5 | const ocrSpaceApiSimpleMock = require('./ocrSpaceApiSimple.mock.json') 6 | 7 | nock('https://api.ocr.space') 8 | .post('/parse/image') 9 | .reply(200, ocrSpaceApiSimpleMock) 10 | 11 | // a GitHub text jpg in base64 format 12 | const imageAsBase64 = 13 | 
'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAP//////////////////////////////////////////////////////////////////////////////////////2wBDAf//////////////////////////////////////////////////////////////////////////////////////wgARCAA+AN8DAREAAhEBAxEB/8QAFwABAQEBAAAAAAAAAAAAAAAAAgEAA//EABQBAQAAAAAAAAAAAAAAAAAAAAD/2gAMAwEAAhADEAAAAUEhjCMYpjGCYRCFKYxjBKUxiAKUwgGGARQGGQAymMYwCiMYJBmMYhCgKIJhkAMxihKExTCAYYDFIYoSlIQxjDAIoBhMYgwmGEIiGGcxCAYRAjAIoBhKIBTBGU5iIYZzEIBhkAMAigGEogFKEhSCIYYDFIYZADCYxBhMYgykIYpSGKQhTGKYhjEKYxjEKU//xAAUEAEAAAAAAAAAAAAAAAAAAABw/9oACAEBAAEFAmD/xAAUEQEAAAAAAAAAAAAAAAAAAABw/9oACAEDAQE/AWD/xAAUEQEAAAAAAAAAAAAAAAAAAABw/9oACAECAQE/AWD/xAAUEAEAAAAAAAAAAAAAAAAAAABw/9oACAEBAAY/AmD/xAAaEAADAQEBAQAAAAAAAAAAAAAAARARIDEw/9oACAEBAAE/ITTZs03nTbveifDmGGRimijFHF2uH8kaMUcXht2o02MU00QxDqHyvIqrkQxTIhiqjmGReRVRVD5QxVRiji8iqiq6QxRxDFHFdiuxXDDJhlwwwy5cMMmGGGT/2gAMAwEAAgADAAAAEBIJIAAJBBAAAAABAAIBAJBIAAAAAAAABJJJBIBBIJIJBJJBBJJIJJABIJAIJBIIBIJJAAABIIJBIJBIJJAAAIIIJABJBIABABIBJAABIAABJAIBAP/EABQRAQAAAAAAAAAAAAAAAAAAAHD/2gAIAQMBAT8QYP/EABQRAQAAAAAAAAAAAAAAAAAAAHD/2gAIAQIBAT8QYP/EAB8QAQACAgEFAQAAAAAAAAAAAAEAMRARcSAhQVGBYf/aAAgBAQABPxCOlTb3Nvtm0HYMQQD0qHUEusLoWb+iDs31L3U2dC0c4EznN8XljG0S73i8sYo4odTbzLvReBtDpUOybglBqKF4QbgGXljFHFE1/YI41/YO+8beYg3ucYF/MWZbG03iXe5f5LECtYBahTeyX6aMecaeMV+xt5gLU2MPZxLMsY3948/kv8ljDc88XgbdTX9iAXFGPONPGK/Y28zzw28zzhpwI4bZ5/Jf5LGG554vLGKOKMecaeMV+xt5nnht5nnE2aiJeNvtx5y/yWMDSwUqJd7l5YxRxQiaWClRT5xX7G3mClTb3gaOcINzWc4HnCGAO+EG5r7gCIe8Ad8JvtNf2BrtEG5zgeYhga7TSaa1NfcAfuP/2Q==' 14 | 15 | const options = { 16 | method: 'POST', 17 | url: 'https://api.ocr.space/parse/image', 18 | headers: { 19 | apikey: process.env.OCR_API_KEY || 'helloworld' 20 | }, 21 | formData: { 22 | language: 'hun', 23 | isOverlayRequired: 'true', 24 | base64image: imageAsBase64, 25 | scale: 'true', 26 | isTable: 'true', 27 | OCREngine: 1 28 | } 29 | } 30 | const optionsError = {} 31 | let parsedResult 32 | 33 | describe('OCR Space Api', function() { 34 | test('should respond 
with a valid parsedResult', async function() { 35 | parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options) 36 | parsedResult = parsedResult.ParsedText 37 | expect(parsedResult).toMatch(/GitHub/gi) 38 | }) 39 | test('should throw error if promise rejects', async function() { 40 | parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(optionsError).rejects 41 | expect(parsedResult).toBeUndefined() 42 | }) 43 | test('should throw error if image is empty', async function() { 44 | ocrSpaceApiSimpleMock.OCRExitCode = 6 45 | nock('https://api.ocr.space') 46 | .post('/parse/image') 47 | .reply(200, ocrSpaceApiSimpleMock) 48 | parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options) 49 | expect(parsedResult).toBeTruthy() 50 | }) 51 | test('should throw error if not standard error thrown on 3rd party side', async function() { 52 | nock('https://api.ocr.space') 53 | .post('/parse/image') 54 | .reply( 55 | 200, 56 | 'For this API KEY only 3 concurrent connections at the same time allowed. Contact support if you need more.' 
57 | ) 58 | parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options) 59 | expect(parsedResult).toBeTruthy() 60 | }) 61 | }) 62 | -------------------------------------------------------------------------------- /lib/dateCatcher.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
const moment = require('moment')
const todayDotSeparated = require('./../src/date').date.todayDotSeparated

// Hungarian month name, one separator character, then the day of month (optionally dot-terminated)
const MONTH_AND_DAY_PATTERN =
  '(január|február|március|április|május' +
  '|június|július|augusztus|szeptember' +
  '|október|november|december).([1-3][0-9]|[1-9])(\\.|)'

/*
 * Weekly menus: decides whether the first date mentioned in the text falls on the same week as today.
 * @param {string} text: text that contains either a month name + day or a dot separated numeric date
 * @return {boolean} true if the parsed date shares its week number with todayDotSeparated
 */
function isInCurrentWeek(text) {
  const monthAndDay = new RegExp(MONTH_AND_DAY_PATTERN, 'i')
  let dateText = text

  // dot separated numeric dates are first converted to the "month name + day" form
  if (!monthAndDay.test(dateText)) {
    const numericDate = dateText.match(/[0-9]{4}\.[0-9]{2}\.[0-9]{2}\./)
    if (numericDate === null) {
      return false
    }
    dateText = moment(numericDate[0], 'YYYY-MM-DD').format('MMMM D')
  }

  const dateParts = dateText.match(monthAndDay)
  if (dateParts === null) {
    return false
  }

  // the menu text carries no year, so the current one is assumed
  const year = moment(todayDotSeparated, 'YYYY-MM-DD').format('YYYY')
  const month = dateParts[1] // e.g. July => július
  const day = dateParts[2]

  const parsedWeek = moment([year, 0, day]).month(month).locale('hu').format('w')
  const currentWeek = moment(todayDotSeparated, 'YYYY-MM-DD').format('w')

  // strict equality: a substring match would accept week "1" while the current week is "12"
  return currentWeek === parsedWeek
}

/*
 * Daily menus listed by dates: decides whether any date found in the text is today.
 * @param {string} text: text that may contain several YYYY?MM?DD like dates
 * @return {boolean} true if one of the dates (formatted by the active locale's 'L' format) is part of todayDotSeparated
 */
function containsToday(text) {
  const candidates = text.match(/[12]\d{3}(.|\..)(0[1-9]|1[0-2])(.|\..)(0[1-9]|[12]\d|3[01])/gm) || []
  // plain substring test: the dots of the formatted date must not act as regex wildcards
  return candidates.some(candidate => todayDotSeparated.includes(moment(candidate, 'YYYY-MM-DD').format('L')))
}

/*
 * Checks if the scraped text refers to the current day (default) or the current week (interval mode).
 * Sets moment's global locale to Hungarian as a side effect.
 * @param {string} textContent: the whole text to be searched for dates
 * @param {boolean} interval: optional argument defaults to false; true compares week numbers instead of exact dates
 * @return {Promise<boolean>} found: false as well if the input is not a string, no date is present or parsing fails
 */
async function dateCatcher(textContent, interval = false) {
  try {
    moment.locale('hu')
    if (typeof textContent !== 'string') {
      return false
    }
    return interval === true ? isInCurrentWeek(textContent) : containsToday(textContent)
  } catch (e) {
    console.error(e)
    return false
  }
}
module.exports.dateCatcher = dateCatcher
'use strict'

const puppeteer = require('puppeteer')
const request = require('request')
const mongoDbInsertMany = require('./../lib/mongoDbInsertMany').mongoDbInsertMany
const activeRequiredScrapers = require('./../conf/requiredScrapers.json').scrapers.active
const date = require('./date').date

// '--debug' as first cli argument posts to the test webhook, anything else goes to production
const webhookEnv = process.argv[2] === '--debug' ? process.env.WEBHOOK_URL_TEST : process.env.WEBHOOK_URL_PROD

// nothing to scrape on bank holidays
if (date.bankHoliday) {
  process.exit(0)
}
console.log('not bank holiday')
console.log('*' + date.dayNames[date.today].toUpperCase() + '*\n' + '='.repeat(date.dayNames[date.today].length))

// these will be the objects we extend (its 'attachments') with each daily menu and for MongoDB as well
const finalJSON = {
  text: '*' + date.dayNames[date.today].toUpperCase() + '* ' + date.todayFormatted + '\n',
  attachments: []
}
const finalMongoJSON = []

// grace period between printing the output and posting it (chance to abort the run)
const POST_DELAY_MS = 5000

/*
 * _POST the serialized menu to the webhook, settles when the request finished
 * @param {string} payload: the stringified finalJSON
 * @return {Promise<void>} rejects on transport errors
 */
function postToWebhook(payload) {
  return new Promise((resolve, reject) => {
    request(
      {
        url: webhookEnv,
        method: 'POST',
        json: false,
        body: payload
      },
      function (error, response) {
        if (error) {
          reject(error)
          return
        }
        if (response && response.statusCode >= 400) {
          console.error('webhook responded with status ' + response.statusCode)
        }
        resolve()
      }
    )
  })
}

// scraper browser instance - function that wraps all the scrapers
async function scrapeMenu() {
  const browser = await puppeteer.launch({ headless: true, defaultViewport: null, args: ['--start-maximized'] })
  try {
    const browserWSEndpoint = browser.wsEndpoint()

    // used outside of main script in the scrapers
    module.exports = { finalJSON, finalMongoJSON, browserWSEndpoint }

    // require scrapers after module.exports object is declared and launch the active ones, see: ./conf/requiredScrapers.json
    // a scraper that fails to load or to run must not prevent the others from running
    for (const scraper of activeRequiredScrapers) {
      try {
        const actual = require(`./../scrapers/${scraper}`)
        await actual.scraper()
      } catch (e) {
        console.error(e)
      }
    }

    const payload = JSON.stringify(finalJSON)
    console.log(payload)

    // the final countdown (before post the actual menu to webhooks)
    console.log('\nWARNING: the output will be posted to slack in 5 seconds!')
    await new Promise(resolve => setTimeout(resolve, POST_DELAY_MS))
    console.log('POST')

    try {
      await postToWebhook(payload)
    } catch (e) {
      console.error(e)
    }

    // store the data to MongoDB
    try {
      await mongoDbInsertMany(finalMongoJSON)
    } catch (e) {
      console.error(e)
    }
  } finally {
    // always release the browser, otherwise a failed run leaves Chromium behind
    await browser.close()
  }
}
scrapeMenu().catch(e => {
  console.error(e)
  process.exitCode = 1
})
const puppeteer = require('puppeteer')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/*
 * @ ROZA
 * ------------------------------------------
 * contact info:
 * Address: Budapest, Jókai u. 22, 1066
 * Phone: (30) 611 4396
 * -----------------------------------------
 * Reads the first '.userContent' element of the facebook posts page, extracts price and cleaned menu text from it,
 * then appends the result to finalJSON.attachments and finalMongoJSON if objectDecider accepts the menu text.
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', intercepted => {
    const isImage = intercepted.resourceType() === 'image'
    isImage ? intercepted.abort() : intercepted.continue()
  })

  // @ ROZA parameters
  const color = '#fced4e'
  const title = 'Róza Soup Restaurant'
  const url = 'https://www.facebook.com/pg/rozafinomitt/posts/'
  const icon =
    'https://scontent.fbud5-1.fna.fbcdn.net/v/t1.0-1/10394619_390942531075147_2725477335166513345_n.jpg?_nc_cat=108&_nc_oc=AQmYePlHDUuQq8mobFYahU1UY5c-BqLoTnXZcMZ6PhYThgnyFqkGNqZWmsHOwzUEwZM&_nc_ht=scontent.fbud5-1.fna&oh=05bb8d72ba040dc6dbe894a50587fcc3&oe=5E3DA8B6'
  const address = 'Budapest, Jókai u. 22, 1066'

  // @ ROZA selector
  const postSelector = '.userContent'

  try {
    await page.goto(url, { waitUntil: 'domcontentloaded' })

    // @ ROZA Daily: the first matching element is used
    const posts = await page.$$(postSelector)
    const latestPost = await page.evaluate(el => el.textContent, posts[0])

    // @ ROZA price catch
    const { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(latestPost)

    // @ ROZA clean string
    const menuText = '• Daily menu: ' + (await stringValueCleaner.stringValueCleaner(latestPost, true))

    console.log('*' + title + '* \n' + '-'.repeat(title.length))
    console.log(menuText)
    console.log(price + priceCurrencyStr + '\n')

    // @ ROZA object
    const slackAttachment = new RestaurantMenuOutput(
      color,
      title,
      url,
      icon,
      menuText,
      price,
      priceCurrency,
      priceCurrencyStr,
      address
    )
    const dbRecord = new RestaurantMenuDb(title, price, priceCurrency, menuText)

    if (objectDecider.objectDecider(menuText)) {
      finalJSON.attachments.push(slackAttachment)
      finalMongoJSON.push(dbRecord)
    }
  } catch (error) {
    console.error(error)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
const puppeteer = require('puppeteer')
const objectDecider = require('./../lib/objectDecider')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/*
 * @ FRUCCOLA
 * ----------------------------------------------
 * contact info:
 * Address: Budapest, Arany János u. 32, 1051
 * Phone: (1) 430 6125
 * ----------------------------------------------
 * Reads today's soup and main dish descriptions plus the first '.soup-and-maindish > .price' element,
 * appends the price trend returned by priceCompareToDb to the currency string,
 * then pushes the result to finalJSON.attachments and finalMongoJSON if objectDecider accepts the menu text.
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', intercepted => {
    const isImage = intercepted.resourceType() === 'image'
    isImage ? intercepted.abort() : intercepted.continue()
  })

  // @ FRUCCOLA parameters
  const color = '#40ae49'
  const title = 'Fruccola (Arany Janos utca)'
  const url = 'http://fruccola.hu/hu'
  const icon = 'https://pbs.twimg.com/profile_images/295153467/fruccola_logo_rgb.png'
  const currency = 'HUF'
  const currencyLabel = ' Ft'
  const address = 'Budapest, Arany János u. 32, 1051'

  // @ FRUCCOLA selectors
  const soupSelector = '#dailymenu-holder > li.arany.today > div.soup > p.description'
  const mainDishSelector = '#dailymenu-holder > li.arany.today > div.main-dish > p.description'
  const readInnerText = el => el.innerText

  try {
    await page.goto(url, { waitUntil: 'networkidle2' })

    // @ FRUCCOLA Daily
    const soup = await page.evaluate(readInnerText, await page.$(soupSelector))
    const mainDish = await page.evaluate(readInnerText, await page.$(mainDishSelector))

    // @ FRUCCOLA price catch
    const priceElements = await page.$$('.soup-and-maindish > .price')
    const price = await page.evaluate(readInnerText, priceElements[0])
    const trend = await priceCompareToDb.priceCompareToDb(title, price)
    const currencyWithTrend = currencyLabel + trend

    const menuText = '• Daily menu: ' + soup + ', ' + mainDish
    console.log('*' + title + '* \n' + '-'.repeat(title.length))
    console.log(menuText)
    console.log(price + currencyWithTrend + '\n')

    // @ FRUCCOLA object
    const slackAttachment = new RestaurantMenuOutput(
      color,
      title,
      url,
      icon,
      menuText,
      price,
      currency,
      currencyWithTrend,
      address
    )
    const dbRecord = new RestaurantMenuDb(title, price, currency, menuText)

    if (objectDecider.objectDecider(menuText)) {
      finalJSON.attachments.push(slackAttachment)
      finalMongoJSON.push(dbRecord)
    }
  } catch (error) {
    console.error(error)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
-------------------------------------------------------------------------------- /scrapers/kamra.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
const puppeteer = require('puppeteer')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/*
 * @ KAMRA
 * ------------------------------------------
 * contact info:
 * Address: Budapest, Hercegprímás u. 19, 1051
 * Phone: (20) 436 9968
 * -----------------------------------------
 * Joins the text of every '.shop_today_title' element into one menu string, catches the price from it,
 * appends the price trend returned by priceCompareToDb to the currency string,
 * then pushes the result to finalJSON.attachments and finalMongoJSON if objectDecider accepts the menu text.
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', intercepted => {
    const isImage = intercepted.resourceType() === 'image'
    isImage ? intercepted.abort() : intercepted.continue()
  })

  // @ KAMRA parameters
  const color = '#fc594e'
  const title = 'Kamra Ételbár'
  const url = 'http://www.kamraetelbar.hu/kamra_etelbar_mai_menu.html'
  const icon = 'https://media-cdn.tripadvisor.com/media/photo-s/06/f5/9b/24/getlstd-property-photo.jpg'
  const address = 'Budapest, Hercegprímás u. 19, 1051'

  // @ KAMRA selectors
  const daySelector = '.shop_today_1'
  const itemSelector = '.shop_today_title'
  const readInnerText = el => el.innerText

  try {
    await page.goto(url, { waitUntil: 'networkidle2' })

    // @ KAMRA Daily
    const dayLabel = await page.evaluate(readInnerText, await page.$(daySelector))
    const itemHandles = await page.$$(itemSelector)
    const items = []
    for (const handle of itemHandles) {
      items.push(await page.evaluate(readInnerText, handle))
    }
    const menuItems = items.join(', ')

    // @ KAMRA price catch
    const { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(menuItems)
    const trend = await priceCompareToDb.priceCompareToDb(title, price)
    const currencyWithTrend = priceCurrencyStr + trend

    const menuText = '• Daily menu: ' + menuItems
    console.log('*' + title + '* \n' + '-'.repeat(title.length))
    console.log(dayLabel + menuText)
    console.log(price + currencyWithTrend + '\n')

    // @ KAMRA object
    const slackAttachment = new RestaurantMenuOutput(
      color,
      title,
      url,
      icon,
      menuText,
      price,
      priceCurrency,
      currencyWithTrend,
      address
    )
    const dbRecord = new RestaurantMenuDb(title, price, priceCurrency, menuText)

    if (objectDecider.objectDecider(menuText)) {
      finalJSON.attachments.push(slackAttachment)
      finalMongoJSON.push(dbRecord)
    }
  } catch (error) {
    console.error(error)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
const objectDecider = require('./../lib/objectDecider')
const ocrSpaceApiSimple = require('./../lib/ocrSpaceApiSimple')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const today = require('./../src/date').date.today
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/*
 * @ KARCSI
 * ------------------------------------------
 * contact info:
 * Address: Budapest, Jókai u. 20, 1066
 * Phone: (1) 312 0557
 * -----------------------------------------
 * Sends the menu pdf's url to the OCR Space api, picks the weekly offer, the soup and today's dish from the
 * parsed text with regexes, then pushes the result to finalJSON.attachments and finalMongoJSON
 * if objectDecider accepts the menu text. The price is a fixed value (not read from the pdf).
 */
async function scraper() {
  // @ KARCSI parameters
  let paramColor = '#ffba44'
  let paramTitleString = 'Karcsi Vendéglö'
  let paramUrl = 'http://karcsibacsivendeglo.com/letoltes/napi_menu.pdf'
  let paramIcon =
    'https://6.kerulet.ittlakunk.hu/files/ittlakunk/styles/large/public/upload/company/1256/karcsi_vendeglo_logo.png'
  let paramValueString
  let paramPriceString = '1250'
  let paramPriceCurrency = 'HUF'
  let paramPriceCurrencyString = ' Ft'
  let paramAddressString = 'Budapest, Jókai u. 20, 1066'

  // @ KARCSI weekly
  let weeklyOfferRegex = /\bHETI MEN.:((.*\r?\n){3})/gi
  let soupRegex = /\bMENÜ 1((.*\r?\n){2})/gi
  // indexed by the day number: 1 (monday) - 5 (friday), other days have no menu
  let karcsiDaysRegexArray = [
    null,
    /\bHÉT((.*\r?\n))/gi,
    /\bKED((.*\r?\n))/gi,
    /\bSZERD((.*\r?\n))/gi,
    /\bCSÜ((.*\r?\n))/gi,
    /\bPÉNT((.*\r?\n))/gi
  ]
  let obj = null
  let mongoObj = null

  const options = {
    method: 'POST',
    url: 'https://api.ocr.space/parse/image',
    headers: {
      apikey: process.env.OCR_API_KEY
    },
    formData: {
      language: 'hun',
      isOverlayRequired: 'true',
      url: paramUrl, // the pdf itself is the page we link to
      scale: 'true',
      isTable: 'true',
      OCREngine: 1
    }
  }
  try {
    let parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options)
    parsedResult = parsedResult.ParsedText

    // a plain lookup instead of a one-round loop; a missing regex (weekend) means no daily match
    // rather than letting String.prototype.match coerce null/undefined into a pattern
    const todayRegex = karcsiDaysRegexArray[Number(today)] || null
    const karcsiDaily = todayRegex === null ? null : parsedResult.match(todayRegex)
    const karcsiWeekly = parsedResult.match(weeklyOfferRegex)
    const karcsiSoup = parsedResult.match(soupRegex)

    let trend = await priceCompareToDb.priceCompareToDb(paramTitleString, paramPriceString)
    paramPriceCurrencyString = paramPriceCurrencyString + trend
    // @ KARCSI clean string
    paramValueString =
      '• Weekly offer: ' +
      (await stringValueCleaner.stringValueCleaner(karcsiWeekly, true)) +
      '\n• Daily menu: ' +
      (await stringValueCleaner.stringValueCleaner(karcsiSoup, true)) +
      (await stringValueCleaner.stringValueCleaner(karcsiDaily, true))
    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')
    // @ KARCSI object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
}
module.exports.scraper = scraper
/**
 * Scrapes today's lunch menu of Cafe Vian and queues it for output.
 *
 * Connects to a running browser through `browserWSEndpoint`, opens the menu page, follows the
 * embedded iframe, reads the menu block that belongs to `today` together with the price, and
 * pushes the result to `finalJSON.attachments` and `finalMongoJSON` when `objectDecider`
 * accepts the menu text. Errors are logged with `console.error` and are not rethrown.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() === 'image') {
      request.abort()
    } else {
      request.continue()
    }
  })

  /*
   * @ VIAN
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Liszt Ferenc tér 9, 1061
   * Phone: (1) 268 1154
   * -----------------------------------------
   * description:
   * vianSelector: selector of the menu blocks, today's block is picked by index
   * vian: is the text inside the block (actual menu) to be displayed in output
   */

  // @ VIAN parameters
  let paramColor = '#cc2b2b'
  let paramTitleString = 'Cafe Vian'
  let paramUrl = 'http://www.cafevian.com/ebedmenue'
  let paramIcon = 'https://static.wixstatic.com/media/d21995_af5b6ceedafd4913b3ed17f6377cdfa7~mv2.png'
  let paramValueString
  let paramPriceString
  let paramPriceCurrency
  let paramPriceCurrencyString
  let paramAddressString = 'Budapest, Liszt Ferenc tér 9, 1061'
  let vian
  let obj = null
  let mongoObj = null

  // @ VIAN selector of the menu blocks
  let vianSelector = 'div.heartyQ2riU'

  try {
    await page.goto(paramUrl, { waitUntil: 'domcontentloaded', timeout: 0 })
    // the menu is rendered inside an iframe, so navigate to the iframe's own document
    let linkSelectorVian = '#TPASection_jkic76na > iframe'
    const linkVian = await page.evaluate(el => el.src, (await page.$$(linkSelectorVian))[0])
    await page.goto(linkVian, { waitUntil: 'domcontentloaded', timeout: 0 })
  } catch (e) {
    console.error(e)
  }
  // @ VIAN Monday-Friday
  try {
    // page.$$ returns every matching element, so today's block can be picked by index;
    // assumes one block per weekday in page order — TODO confirm against the live page
    const vianBlocks = await page.$$(vianSelector)
    const todaysBlock = vianBlocks[today - 1]
    if (todaysBlock !== undefined) {
      vian = await page.evaluate(el => el.innerText, todaysBlock)
    } else {
      vian = '♪"No Milk Today"♫'
    }
    const body = await page.evaluate(el => el.textContent, (await page.$$('#mainDiv'))[0])
    // @ VIAN price catch
    let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(body)
    let trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)

    paramPriceString = price
    paramPriceCurrency = priceCurrency
    paramPriceCurrencyString = priceCurrencyStr + trend

    paramValueString = '• Daily menu: ' + vian
    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')
    // @ VIAN object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
/**
 * Scrapes today's chef offer of Bodza bistro and queues it for output.
 *
 * Connects to a running browser through `browserWSEndpoint`, opens the weekly menu page and walks
 * the `.container` elements until one mentions `todayDotSeparated`. The price and the
 * "CHEF NAPI AJÁNLATA" section are taken from that element. When no element matches, or the
 * section is missing, the "No Milk Today" placeholder is reported instead. The result is pushed to
 * `finalJSON.attachments` and `finalMongoJSON` when `objectDecider` accepts the menu text.
 * Errors are logged with `console.error` and are not rethrown.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() === 'image') {
      request.abort()
    } else {
      request.continue()
    }
  })

  /*
   * @ BODZA BISTRO
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Bajcsy-Zsilinszky út 12, 1051
   * Phone: 06 (30) 515-52-34
   * -----------------------------------------
   */

  // @ BODZA parameters
  let paramColor = '#c7ef81'
  let paramTitleString = 'Bodza bistro'
  let paramUrl = 'http://bodzabistro.hu/heti-menu/'
  let paramIcon = 'http://bodzabistro.hu/wp-content/uploads/2016/03/nevtelen-1.png'
  let paramSelector = '.container'
  let paramValueString
  // fallbacks, kept unless a container for today is found
  let paramPriceString = 'n/a'
  let paramPriceCurrency = 'n/a'
  let paramPriceCurrencyString = ''
  let paramAddressString = 'Budapest, Bajcsy-Zsilinszky út 12, 1051'
  let bodzaDaily = '♪"No Milk Today"♫'
  let obj = null
  let mongoObj = null

  try {
    await page.goto(paramUrl, { waitUntil: 'domcontentloaded', timeout: 0 })
    // @ BODZA selectors
    const bodzaBlocks = await page.$$(paramSelector)
    // @ BODZA Monday-Friday
    for (const bodzaBlock of bodzaBlocks) {
      const blockText = await page.evaluate(el => el.textContent, bodzaBlock)
      if (!blockText.match(todayDotSeparated)) {
        continue
      }
      // @ BODZA price catch
      let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(blockText)
      let trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
      paramPriceString = price
      paramPriceCurrency = priceCurrency
      paramPriceCurrencyString = priceCurrencyStr + trend

      // match() returns null when the section is missing, keep the placeholder in that case
      const chefOffer = blockText.match(/(.*)CHEF NAPI AJÁNLATA(.*\r?\n){3}/gi)
      if (chefOffer !== null) {
        bodzaDaily = chefOffer
          .join()
          .replace(/(\r?\n)/gm, ' ')
          .replace(/\s\s+/gm, ' ')
          .replace(/(.*)CHEF NAPI AJÁNLATA/g, '')
      }
      // only the first container that mentions today's date is used
      break
    }
    paramValueString = '• Daily menu: ' + bodzaDaily
    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')
    // @ BODZA object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
/**
 * Scrapes today's lunch menu of Yamato and queues it for output.
 *
 * Connects to a running browser through `browserWSEndpoint`, reads the whole page text for the
 * date and price checks, then takes the text of the `.fr-tag` element mapped to `today`.
 * When `dateCatcher` does not report `true` the "No Milk Today" placeholder is used.
 * The result is pushed to `finalJSON.attachments` and `finalMongoJSON` when `objectDecider`
 * accepts the menu text. Errors are logged with `console.error` and are not rethrown.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // images are not needed for text scraping, so every image request is aborted
  // source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', req => {
    if (req.resourceType() !== 'image') {
      req.continue()
      return
    }
    req.abort()
  })

  /*
   * @ YAMATO
   * ---------------------------------------
   * contact info:
   * Address: Budapest, 1066, JÓKAI U. 30.
   * Phone: +36(70)681-75-44
   * ---------------------------------------
   * description:
   * yamatoArray: maps the day of the week to the index of the matching '.fr-tag' element
   * yamato: the text inside that element, cleaned for display in the output
   */

  // @ YAMATO parameters
  const paramColor = '#cca92b'
  const paramTitleString = 'Yamato'
  const paramUrl = 'https://www.wasabi.hu/napimenu.php?source=yamato&lang=hu'
  const paramIcon = 'http://yamatorestaurant.hu/wp-content/uploads/2014/12/yamato_logo_retina.png'
  const paramAddressString = 'Budapest, 1066, Jókai u. 30.'

  // @ YAMATO selectors
  const yamatoSelector = '.fr-tag'
  const yamatoArray = [0, 1, 3, 5, 7, 9]

  try {
    await page.goto(paramUrl, { waitUntil: 'domcontentloaded', timeout: 0 })
    const bodyHandle = await page.$('body')
    const theWhole = await page.evaluate(el => el.textContent, bodyHandle)
    // @ YAMATO catch date
    const found = await dateCatcher.dateCatcher(theWhole)
    // @ YAMATO price catch
    const { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(theWhole)
    const trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)

    const paramPriceString = price
    const paramPriceCurrency = priceCurrency
    const paramPriceCurrencyString = priceCurrencyStr + trend

    // @ YAMATO Monday-Friday (the loop bounds only cover today's index)
    for (let dayIndex = today; dayIndex < today + 1; dayIndex++) {
      let yamato = '♪"No Milk Today"♫'
      if (found === true) {
        const dayBlocks = await page.$$(yamatoSelector)
        const rawMenu = await page.evaluate(el => el.textContent, dayBlocks[yamatoArray[dayIndex]])
        yamato = rawMenu.replace(/(\r?\n)/gm, ', ')
      }

      const paramValueString = '• Daily menu: ' + yamato
      console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
      console.log(paramValueString)
      console.log(paramPriceString + paramPriceCurrencyString + '\n')

      // @ YAMATO object
      const obj = new RestaurantMenuOutput(
        paramColor,
        paramTitleString,
        paramUrl,
        paramIcon,
        paramValueString,
        paramPriceString,
        paramPriceCurrency,
        paramPriceCurrencyString,
        paramAddressString
      )
      const mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
      if (objectDecider.objectDecider(paramValueString)) {
        finalJSON.attachments.push(obj)
        finalMongoJSON.push(mongoObj)
      }
    }
  } catch (err) {
    console.error(err)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
/**
 * Scrapes today's lunch menu of Két Szerecsen Bisztro and queues it for output.
 *
 * Connects to a running browser through `browserWSEndpoint`, reads the visible text of the page
 * and cuts out the section that starts with today's Hungarian day name. The price is caught from
 * the same text. The result is pushed to `finalJSON.attachments` and `finalMongoJSON` when
 * `objectDecider` accepts the menu text. Errors are logged with `console.error` and are not
 * rethrown.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // images are not needed for text scraping, so every image request is aborted
  // source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', req => {
    if (req.resourceType() !== 'image') {
      req.continue()
      return
    }
    req.abort()
  })

  /*
   * @ KETSZERECSEN
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Nagymező u. 14, 1065
   * Phone: (1) 343 1984
   * -----------------------------------------
   * description:
   * ketszerecsenArray: one RegEx per day of the week, indexed by `today`
   * ketszerecsen: the matched text (actual menu) to be displayed in output
   */

  // @ KETSZERECSEN parameters
  const paramColor = '#000000'
  const paramTitleString = 'Két Szerecsen Bisztro'
  const paramUrl = 'https://ketszerecsen.hu/#daily'
  const paramIcon = 'https://images.deliveryhero.io/image/netpincer/caterer/sh-9a3e84d0-2e42-11e2-9d48-7a92eabdcf20/logo.png'
  const paramAddressString = 'Budapest, Nagymező u. 14, 1065'

  // @ KETSZERECSEN RegEx expressions (index 0 is unused)
  const ketszerecsenArray = [
    null,
    /hétfő((.*)\r?\n+){4}/gim,
    /kedd((.*)\r?\n+){4}/gim,
    /szerda((.*)\r?\n+){4}/gim,
    /csütörtök((.*)\r?\n+){4}/gim,
    /péntek((.*)\r?\n+){4}/gim
  ]

  try {
    await page.goto(paramUrl, { waitUntil: 'networkidle2' })
    const bodyHandles = await page.$$('body')
    const body = await page.evaluate(el => el.innerText, bodyHandles[0])

    // @ KETSZERECSEN Monday-Friday
    const dayMatch = body.match(ketszerecsenArray[today])
    let ketszerecsen
    if (dayMatch) {
      ketszerecsen = dayMatch[0]
    } else {
      ketszerecsen = '♪"No Milk Today"♫'
    }

    // @ KETSZERECSEN price catch
    console.log('\n')
    const { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(body)
    const trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
    const paramPriceString = price
    const paramPriceCurrency = priceCurrency
    const paramPriceCurrencyString = priceCurrencyStr + trend

    // @ KETSZERECSEN build output
    const cleanedMenu = await stringValueCleaner.stringValueCleaner(ketszerecsen, true)
    const paramValueString =
      '• Daily menu: ' + cleanedMenu + '\n_Rövidítések: GM — gluténmentes, LM — laktózmentes_'
    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')

    // @ KETSZERECSEN object
    const obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    const mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (err) {
    console.error(err)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
/**
 * Scrapes today's lunch menu of Menza and queues it for output.
 *
 * Connects to a running browser through `browserWSEndpoint`, reads the visible text of the page
 * and cuts out the section that starts with today's Hungarian day name. The menu is only reported
 * when `dateCatcher` returns `true` for the page text, otherwise the "No Milk Today" placeholder
 * is used. The result is pushed to `finalJSON.attachments` and `finalMongoJSON` when
 * `objectDecider` accepts the menu text. Errors are logged with `console.error` and are not
 * rethrown.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // images are not needed for text scraping, so every image request is aborted
  // source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', req => {
    if (req.resourceType() !== 'image') {
      req.continue()
      return
    }
    req.abort()
  })

  /*
   * @ MENZA
   * ------------------------------------------
   * contact info:
   * Address: 1061 Budapest, Liszt Ferenc tér 2.
   * Phone: +36 30 145 4242
   * -----------------------------------------
   */

  // @ MENZA parameters
  const paramColor = '#be8e8e'
  const paramTitleString = 'Menza'
  const paramUrl = 'https://menzaetterem.hu/etlap/'
  const paramIcon = 'https://menzaetterem.hu/site/themes/menza/typerocket/wordpress/assets/images/favicon-32x32.png'
  const paramAddressString = '1061 Budapest, Liszt Ferenc tér 2.'

  // @ MENZA RegEx expressions (index 0 is unused)
  const menzaArray = [
    null,
    /hétfő((.*)\r?\n+){7}/gim,
    /kedd((.*)\r?\n+){7}/gim,
    /szerda((.*)\r?\n+){7}/gim,
    /csütörtök((.*)\r?\n+){7}/gim,
    /péntek((.*)\r?\n+){7}/gim
  ]

  try {
    await page.goto(paramUrl, { waitUntil: 'networkidle2' })
    const bodyHandles = await page.$$('body')
    const body = await page.evaluate(el => el.innerText, bodyHandles[0])

    // @ MENZA Monday-Friday
    const dayMatch = body.match(menzaArray[today])
    let dailyMenza
    if (dayMatch) {
      dailyMenza = dayMatch[0]
    } else {
      dailyMenza = '♪"No Milk Today"♫'
    }

    // @ MENZA price catch
    console.log('\n')
    const { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(body)
    const trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
    const paramPriceString = price
    const paramPriceCurrency = priceCurrency
    const paramPriceCurrencyString = priceCurrencyStr + trend

    // @ MENZA date catch
    const found = await dateCatcher.dateCatcher(body, true)

    // @ MENZA build output
    let paramValueString = '• Daily menu: ♪"No Milk Today"♫'
    if (found === true) {
      const cleanedMenu = await stringValueCleaner.stringValueCleaner(dailyMenza, false)
      paramValueString =
        '• Daily menu: ' +
        cleanedMenu +
        '\n_Főbb étel allergének: A — glutén, B — rákfélék, C — tojás, D — hal, E — földimogyoró, F — szója, G — tej, H — diófélék, I — zeller, J — mustár, K — szezámmag, L — kén-dioxid, M — csillagfürt, N — puhatestűek, V — vegán_'
    }

    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')

    // @ MENZA object
    const obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    const mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (err) {
    console.error(err)
  }

  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
142 | -------------------------------------------------------------------------------- /scrapers/bank3.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
const puppeteer = require('puppeteer')
const ocrSpaceApiSimple = require('./../lib/ocrSpaceApiSimple')
const dateCatcher = require('./../lib/dateCatcher')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/**
 * Scrapes the Bank 3 daily menu.
 *
 * Reads the menu image URL and the date text from the restaurant page, sends
 * the image URL to the OCR.space endpoint, extracts the price from the OCR
 * text and, when objectDecider accepts the final menu string, pushes the
 * result objects into the shared finalJSON.attachments / finalMongoJSON arrays.
 *
 * Errors raised while scraping are logged with console.error; the page is
 * blanked, closed and the browser disconnected afterwards.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() === 'image') {
      request.abort()
    } else {
      request.continue()
    }
  })

  /*
   * @ BANK 3
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Bank u. 3, 1054
   * Phone: +36 (1) 788 2775
   * -----------------------------------------
   */

  // @ BANK 3 parameters
  let paramColor = '#000000'
  let paramTitleString = 'Bank 3'
  let paramUrl = 'http://www.bank3.hu/'
  let paramIcon = 'http://m.bank3.hu/ogs/sh-b6fc80e2-df8b-c2e5-1000-e056251f1792_logo_200x200.png?ver=1474612940'
  let paramValueString
  let paramPriceString
  let paramPriceCurrency
  let paramPriceCurrencyString
  let paramAddressString = 'Budapest, Bank u. 3, 1054'
  let bank3Date
  // declared locally: these used to be assigned without a declaration, which
  // created implicit globals shared with every other module in the process
  let bank3ImgUrl
  let parsedResult
  let found
  let trend
  let obj = null
  let mongoObj = null

  // @ BANK 3 selectors
  const bank3Selector = '.lightbox > img'
  const bank3DateSelector = '.text-right'

  try {
    await page.goto(paramUrl, { waitUntil: 'networkidle2' })
    bank3ImgUrl = await page.evaluate(el => el.src, (await page.$$(bank3Selector))[0])
    bank3Date = await page.evaluate(el => el.textContent, (await page.$$(bank3DateSelector))[0])

    // @ BANK 3 OCR
    const options = {
      method: 'POST',
      url: 'https://api.ocr.space/parse/image',
      headers: {
        apikey: process.env.OCR_API_KEY
      },
      formData: {
        language: 'hun',
        isOverlayRequired: 'true',
        url: bank3ImgUrl,
        scale: 'true',
        isTable: 'true',
        OCREngine: 1
      }
    }
    try {
      parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options)
      parsedResult = parsedResult.ParsedText
    } catch (e) {
      // an OCR failure is logged only; parsedResult keeps whatever value it had
      console.error(e)
    }

    // @ BANK 3 price catch
    let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(parsedResult, 1)
    trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
    paramPriceString = price
    paramPriceCurrency = priceCurrency
    paramPriceCurrencyString = priceCurrencyStr + trend

    // @ BANK 3 date catch
    found = await dateCatcher.dateCatcher(bank3Date)

    // @ BANK 3 menu parse
    if (found === true) {
      paramValueString = parsedResult
      paramValueString = '• Daily menu: ' + (await stringValueCleaner.stringValueCleaner(paramValueString, false))
    } else {
      paramValueString = '• Daily menu: ♪"No Milk Today"♫'
    }

    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')

    // @ BANK 3 object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
const puppeteer = require('puppeteer')
const objectDecider = require('./../lib/objectDecider')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const today = require('./../src/date').date.today
const dayNames = require('./../src/date').date.dayNames
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/**
 * Scrapes the Bistro Suppé facebook posts page.
 *
 * Three greeting posts are located by XPath ("Sziasztok") and stripped of
 * their boilerplate phrases. When `today === 1` the Monday text is used,
 * otherwise the daily and weekly texts joined by a newline. The price is the
 * fixed value below; only its trend suffix comes from priceCompareToDb.
 * Results are pushed into finalJSON.attachments / finalMongoJSON when
 * objectDecider accepts the menu text.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() !== 'image') {
      request.continue()
      return
    }
    request.abort()
  })

  /*
   * @ SUPPÉ bistro
   * ---------------------------------------
   * contact info:
   * Address: Hajós u. 19 (19.45 mi), Budapest, Hungary 1065
   * Phone: (70) 336 0822
   * ---------------------------------------
   * Description:
   * scrape facebook posts based on xpath patterns
   * todo: avoid xpath and use selectors
   * replace redundant string patterns with regex
   */

  // @ SUPPÉ parameters
  const color = '#b5dd8d'
  const title = 'Bistro Suppé'
  const url = 'https://www.facebook.com/pg/bistrosuppe/posts/'
  const icon =
    'https://scontent.fbud5-1.fna.fbcdn.net/v/t1.0-9/1377248_364465010354681_215635093_n.jpg?_nc_cat=101&_nc_oc=AQm91PjrSi-ey80DSDwdQ3M3QHzUeuVWy-oElgtNm3nn2HdoSNFxNcRZGwQPDG2Hkmo&_nc_ht=scontent.fbud5-1.fna&oh=925e1c2bb1782f4ab82cd02ef911ecc1&oe=5E254656'
  const price = '1190'
  const currency = 'HUF'
  const address = 'Budapest, Hajós u. 19, 1065'

  try {
    await page.goto(url, { waitUntil: 'networkidle2' })

    /*
     * @ SUPPÉ selector, source: https://stackoverflow.com/questions/48448586/how-to-use-xpath-in-chrome-headlesspuppeteer-evaluate
     * @ SUPPÉ Daily
     */
    const [dailyHandle] = await page.$x('//span[contains(text(), "Sziasztok")]')
    const dailyRaw = await page.evaluate(el => el.textContent, dailyHandle)
    const dailySuppe = dailyRaw.replace(/Sziasztok, |, kellemes hétvégét!|, szép napot!|, várunk Titeket!/gi, '')

    // @ SUPPÉ Weekly (on Monday)
    const [weeklyHandle] = await page.$x('//p[contains(text(), "Sziasztok")]')
    const weeklyRaw = await page.evaluate(el => el.textContent, weeklyHandle)
    const weeklySuppe = weeklyRaw.replace(/(?=sziasztok)(.*)(?=levesek )|(?=mai)(.*)(?=\s*)/gi, '')

    // @ SUPPÉ Monday only (on Monday)
    const [mondayHandle] = await page.$x('//p[contains(text(), "Sziasztok")]')
    const mondayRaw = await page.evaluate(el => el.textContent, mondayHandle)
    const mondaySuppe = mondayRaw.replace(/(?=sziasztok)(.*)(?=levesek )|(, várunk Titeket!)/gi, '')

    const trend = await priceCompareToDb.priceCompareToDb(title, price)
    const currencyLabel = ' Ft' + trend

    // Monday posts carry the whole menu; other days combine daily + weekly
    const menuText = today === 1 ? mondaySuppe : dailySuppe + '\n' + weeklySuppe

    console.log('*' + title + '* \n' + '-'.repeat(title.length))
    console.log('• ' + dayNames[today] + ': ' + menuText)
    console.log(price + currencyLabel + '\n')

    // @ SUPPÉ object
    const outputEntry = new RestaurantMenuOutput(
      color,
      title,
      url,
      icon,
      menuText,
      price,
      currency,
      currencyLabel,
      address
    )
    const dbEntry = new RestaurantMenuDb(title, price, currency, menuText)
    if (objectDecider.objectDecider(menuText)) {
      finalJSON.attachments.push(outputEntry)
      finalMongoJSON.push(dbEntry)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
const puppeteer = require('puppeteer')
const moment = require('moment')
const dateCatcher = require('./../lib/dateCatcher')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const today = require('./../src/date').date.today
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/**
 * Scrapes the Mozsár Bisztro weekly menu page.
 *
 * Reads the menu text, the date header and the price summary from the page,
 * checks the date range with dateCatcher and cuts today's entry out of the
 * menu text with the per-day regex. When the date check fails, or today's
 * entry cannot be found in the text, the "No Milk Today" placeholder is used.
 * Results are pushed into finalJSON.attachments / finalMongoJSON when
 * objectDecider accepts the menu text.
 *
 * Errors raised while scraping are logged with console.error.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() === 'image') {
      request.abort()
    } else {
      request.continue()
    }
  })

  /*
   * @ MOZSAR
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Nagymező u. 21, 1065
   * Phone: +36 (70) 426 8199
   * -----------------------------------------
   */

  // @ MOZSAR parameters
  let paramColor = '#bc4545'
  let paramTitleString = 'Mozsár Bisztro'
  let paramUrl = 'http://mozsarbisztro.hu/index.php?p=3'
  let paramIcon =
    'https://www.programturizmus.hu/media/image/big/ajanlat/program/tudomanyos-programok/tanfolyam/76/19396-mozsar-kavezo-program.jpg'
  let paramValueString
  let paramPriceString
  let paramPriceCurrency
  let paramPriceCurrencyString
  let paramAddressString = 'Budapest, Nagymező u. 21, 1065'
  let mozsar
  let mozsarDate
  let mozsarSummary
  let found
  let trend
  let obj = null
  let mongoObj = null

  // @ MOZSAR selectors
  const mozsarSelector = '#etlapresult'
  const mozsarDateSelector = '.flipInY'
  const mozsarPriceSelector = '.item'
  // indexed by `today`; index 0 is a null placeholder, 1-5 are Monday-Friday
  const mozsarDaysRegexArray = [
    null,
    /hétfő(.*\r?\n){2}/gi,
    /kedd(.*\r?\n){2}/gi,
    /szerda(.*\r?\n){2}/gi,
    /csütörtök(.*\r?\n){2}/gi,
    /péntek(.*\r?\n){2}/gi
  ]

  try {
    await page.goto(paramUrl, { waitUntil: 'networkidle2' })
    mozsar = await page.evaluate(el => el.textContent, (await page.$$(mozsarSelector))[0])
    mozsarDate = await page.evaluate(el => el.textContent, (await page.$$(mozsarDateSelector))[0])
    mozsarSummary = await page.evaluate(el => el.textContent, (await page.$$(mozsarPriceSelector))[0])

    // @ MOZSAR price catch
    let { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(mozsarSummary)
    trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
    paramPriceString = price
    paramPriceCurrency = priceCurrency
    paramPriceCurrencyString = priceCurrencyStr + trend

    // @ MOZSAR date catch
    mozsarDate = mozsarDate.replace(/Heti menü |- /gi, moment().year() + '.').replace(/\. /gi, '. - ')
    found = await dateCatcher.dateCatcher(mozsarDate, true)

    // @ MOZSAR menu parse
    // String.prototype.match returns null when today's block is absent; calling
    // .toString() on that used to throw and discard the whole result, so a
    // missing match now falls through to the "no menu" placeholder instead
    const todaysMatch = found === true ? mozsar.match(mozsarDaysRegexArray[today]) : null
    if (todaysMatch !== null) {
      paramValueString = todaysMatch.toString().replace(/ Ital(\r?\n)/g, '') // ugly, but cleaner value for Fridays from necessary regex
      paramValueString = '• Daily menu: ' + (await stringValueCleaner.stringValueCleaner(paramValueString, false))
    } else {
      paramValueString = '• Daily menu: ♪"No Milk Today"♫'
    }

    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')

    // @ MOZSAR object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
const puppeteer = require('puppeteer')
const dateCatcher = require('./../lib/dateCatcher')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/**
 * Scrapes the Korhely weekly menu.
 *
 * The menu page embeds the menu in an iframe, so the iframe's src is opened
 * directly first. The second description element supplies both the price and
 * the date range; when dateCatcher accepts it, the soup / main course /
 * dessert sections are read and cleaned one after another, otherwise the
 * menu text is 'menu is outdated!'. Results are pushed into
 * finalJSON.attachments / finalMongoJSON when objectDecider accepts the text.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() !== 'image') {
      request.continue()
      return
    }
    request.abort()
  })

  /*
   * @ KORHELY
   * ---------------------------------------------
   * contact info:
   * Address: Budapest, Liszt Ferenc tér 7, 1061
   * Phone: (1) 321 0280
   * ---------------------------------------------
   */

  // @ KORHELY parameters
  const color = '#c6b443'
  const title = 'Korhely'
  const url = 'http://www.korhelyfaloda.hu/menu'
  const icon = 'https://etterem.hu/img/max960/p9787n/1393339359-3252.jpg'
  const address = 'Budapest, Liszt Ferenc tér 7, 1061'

  // @ KORHELY selectors
  const summarySelector = '.menusNavigation_description'
  const iframeSelector = '#TPASection_ije2yufiiframe'
  // label + selector for each menu section, in output order
  const sections = [
    ['• Soups: ', '#subCategory1'],
    ['• Main courses: ', '#subCategory2'],
    ['• Desserts: ', '#subCategory3']
  ]

  // step 1: open the page, then jump into the embedded menu frame
  try {
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 0 })
    const frameUrl = await page.evaluate(el => el.src, await page.$(iframeSelector))
    await page.goto(frameUrl, { waitUntil: 'networkidle2', timeout: 0 })
  } catch (e) {
    console.error(e)
  }

  // @ KORHELY Weekly
  try {
    const summaryHandles = await page.$$(summarySelector)
    const summary = await page.evaluate(el => el.textContent, summaryHandles[1])

    // @ KORHELY price catch
    const { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(summary)
    const trend = await priceCompareToDb.priceCompareToDb(title, price)
    const currencyLabel = priceCurrencyStr + trend

    let menuText
    const found = await dateCatcher.dateCatcher(summary, true)
    if (found !== true) {
      menuText = 'menu is outdated!'
    } else {
      // read all three sections first (second match of each selector) ...
      const rawSections = []
      for (const [, selector] of sections) {
        const handles = await page.$$(selector)
        rawSections.push(await page.evaluate(el => el.innerText, handles[1]))
      }
      // ... then clean them sequentially, in the same order
      const lines = []
      for (let idx = 0; idx < sections.length; idx++) {
        const cleaned = await stringValueCleaner.stringValueCleaner(rawSections[idx], false)
        lines.push(sections[idx][0] + cleaned)
      }
      menuText = lines.join('\n')
    }

    console.log('*' + title + '* \n' + '-'.repeat(title.length))
    console.log(menuText)
    console.log(price + currencyLabel + '\n')

    // @ KORHELY object
    const outputEntry = new RestaurantMenuOutput(
      color,
      title,
      url,
      icon,
      menuText,
      price,
      priceCurrency,
      currencyLabel,
      address
    )
    const dbEntry = new RestaurantMenuDb(title, price, priceCurrency, menuText)
    if (objectDecider.objectDecider(menuText)) {
      finalJSON.attachments.push(outputEntry)
      finalMongoJSON.push(dbEntry)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
-------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
const puppeteer = require('puppeteer')
const dateCatcher = require('./../lib/dateCatcher')
const objectDecider = require('./../lib/objectDecider')
const priceCatcher = require('./../lib/priceCatcher')
const priceCompareToDb = require('./../lib/priceCompareToDb')
const stringValueCleaner = require('./../lib/stringValueCleaner')
const browserWSEndpoint = require('./../src/dailyMenuScraper').browserWSEndpoint
const finalJSON = require('./../src/dailyMenuScraper').finalJSON
const finalMongoJSON = require('./../src/dailyMenuScraper').finalMongoJSON
const RestaurantMenuOutput = require('./../src/restaurantMenuClasses').RestaurantMenuOutput
const RestaurantMenuDb = require('./../src/restaurantMenuClasses').RestaurantMenuDb

/**
 * Scrapes the I55 weekly lunch menu.
 *
 * The restaurant's own menu page is tried first. When dateCatcher rejects
 * its text, the facebook posts page is used as a fallback: up to the first
 * ten posts are scanned for a "levesek ... ebédelj" block, stopping at the
 * first one whose date check passes. Results are pushed into
 * finalJSON.attachments / finalMongoJSON when objectDecider accepts the text.
 *
 * Errors raised while scraping are logged with console.error.
 *
 * @returns {Promise<void>}
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
  await page.setRequestInterception(true)
  page.on('request', request => {
    if (request.resourceType() === 'image') {
      request.abort()
    } else {
      request.continue()
    }
  })

  /*
   * @ I55 American Restaurant
   * ------------------------------------------
   * contact info:
   * Address: Budapest, Alkotmány u. 20, 1054
   * Phone: (1) 400 9580
   * -----------------------------------------
   */

  // @ I55 parameters
  let paramColor = '#104283'
  let paramTitleString = 'I55'
  let paramUrl = 'http://i55.hu/ebedmenu/'
  let paramUrlFallback = 'https://www.facebook.com/pg/i55americanrestaurant/posts/'
  let paramIcon = 'https://scontent.fbud5-1.fna.fbcdn.net/v/t39.30808-1/324270544_737231364791960_1074046147067895687_n.png?stp=dst-png_p200x200&_nc_cat=108&ccb=1-7&_nc_sid=c6021c&_nc_ohc=GvK2WApiMiwAX_HB-xW&_nc_ht=scontent.fbud5-1.fna&oh=00_AfA8-klI26XAAxlTQ_ghlQr73vKi3wjT9SHFltuJIXEE2g&oe=63F7744A'
  let paramValueString
  let paramPriceString
  let paramPriceCurrency
  let paramPriceCurrencyString
  let paramAddressString = 'Budapest, Alkotmány u. 20, 1054'
  let weeklyI55
  let weeklyI55Daily
  let found
  let trend
  let obj = null
  let mongoObj = null

  // @ I55 selectors
  const weeklyI55Selector = '#szoszok'
  const weeklyI55SelectorFallback = '.userContent'
  // upper bound on how many fallback posts are inspected
  const maxFallbackPosts = 10

  try {
    // `waitUntil` is case-sensitive; the former `waituntil` key was ignored
    await page.goto(paramUrl, { waitUntil: 'domcontentloaded', timeout: 0 })
    weeklyI55 = await page.evaluate(el => el.textContent, (await page.$$(weeklyI55Selector))[0])
    weeklyI55Daily = weeklyI55.match(/levesek([\s\S]*?)ebédelj/gi)
    // @ I55 price catch
    let { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(weeklyI55)
    trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
    paramPriceString = price
    paramPriceCurrency = priceCurrency
    paramPriceCurrencyString = priceCurrencyStr + trend

    found = await dateCatcher.dateCatcher(weeklyI55, true) // @ I55 catch date
    if (found === true) {
      paramValueString = await stringValueCleaner.stringValueCleaner(weeklyI55Daily, false)
      paramValueString = paramValueString.replace(/\(\)/g, '').replace(/\n/, ' ') // to be moved to stringValueCleaner module later!
      paramValueString += '\n_Rövidítések: Gm — gluténmentes, Lm — laktózmentes_'

      // fallback on facebook page
    } else {
      await page.goto(paramUrlFallback, { waitUntil: 'domcontentloaded', timeout: 0 })
      // only walk the posts that actually exist: indexing a fixed 0..9 range
      // passed `undefined` to page.evaluate on pages with fewer posts
      const posts = (await page.$$(weeklyI55SelectorFallback)).slice(0, maxFallbackPosts)
      for (const post of posts) {
        weeklyI55 = await page.evaluate(el => el.textContent, post)
        const menuMatch = weeklyI55.match(/levesek([\s\S]*?)ebédelj/gi)
        if (menuMatch === null) {
          continue
        }
        weeklyI55Daily = menuMatch
        // @ I55 price catch
        let { price, priceCurrencyStr, priceCurrency } = await priceCatcher.priceCatcher(weeklyI55, 1)
        trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
        paramPriceString = price
        paramPriceCurrency = priceCurrency
        paramPriceCurrencyString = priceCurrencyStr + trend
        found = await dateCatcher.dateCatcher(weeklyI55, true) // @ I55 catch date
        if (found === true) {
          paramValueString = await stringValueCleaner.stringValueCleaner(weeklyI55Daily, false)
          paramValueString = paramValueString.replace(/\(\)/g, '').replace(/\n/, ' ') // to be moved to stringValueCleaner module later!
          break
        }
        paramValueString = 'menu is outdated!'
      }
    }
    console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
    console.log(paramValueString)
    console.log(paramPriceString + paramPriceCurrencyString + '\n')

    // @ I55 object
    obj = new RestaurantMenuOutput(
      paramColor,
      paramTitleString,
      paramUrl,
      paramIcon,
      paramValueString,
      paramPriceString,
      paramPriceCurrency,
      paramPriceCurrencyString,
      paramAddressString
    )
    mongoObj = new RestaurantMenuDb(paramTitleString, paramPriceString, paramPriceCurrency, paramValueString)
    if (objectDecider.objectDecider(paramValueString)) {
      finalJSON.attachments.push(obj)
      finalMongoJSON.push(mongoObj)
    }
  } catch (e) {
    console.error(e)
  }
  await page.goto('about:blank')
  await page.close()
  await browser.disconnect()
}
module.exports.scraper = scraper
/*
 * @ {RESTAURANT}s with only facebook image menus
 *
 * Logs in to facebook (credentials come from FB_USERNAME / FB_PASSWORD env variables),
 * opens the restaurant's page, collects the alt texts facebook generated for images
 * containing text and uses the first one that matches the menu as today's menu.
 * If objectDecider accepts the cleaned menu string the result is pushed to
 * finalJSON.attachments and finalMongoJSON, the function itself resolves to undefined.
 *
 * @paramColor : passed through to RestaurantMenuOutput
 * @paramTitleString : restaurant name, used for output, logging and the price comparison
 * @paramUrl : the facebook page that is opened after login
 * @paramIcon : passed through to RestaurantMenuOutput
 * @paramAddressString : passed through to RestaurantMenuOutput
 * @paramDaysRegexArray : regexes indexed by `today`, the first match is today's menu
 * @paramMenuHandleRegex : an image alt text has to match this to be treated as a menu
 */
async function ocrFacebookImage(
  paramColor,
  paramTitleString,
  paramUrl,
  paramIcon,
  paramAddressString,
  paramDaysRegexArray,
  paramMenuHandleRegex
) {
  // alt text prefixes of facebook's own OCR (English and Hungarian UI)
  const menuAltRegex = /May be an image of text that says|Lehet, hogy egy kép erről/i
  const collectMenuAlts = async currentPage => {
    const alts = await currentPage.$$eval('img', elems => elems.map(el => el.alt))
    return alts.filter(alt => menuAltRegex.test(alt))
  }

  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()
  const todaysRegex = paramDaysRegexArray[today]
  let imageAltArray = []

  try {
    try {
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
      )
      await page.goto('https://www.facebook.com/login', { waitUntil: 'networkidle0' })
      // page.$ resolves to null without a match (page.$$(...)[0] would be undefined, so a !== null check on it never fails)
      if ((await page.$('input[name="email"]')) !== null) {
        await page.type('input[name="email"]', process.env.FB_USERNAME)
        await page.type('input[name="pass"]', process.env.FB_PASSWORD)
        await page.waitForTimeout(500)
        await page.click('button[name="login"]')
        await page.waitForTimeout(2000)
      }
      // @ {RESTAURANT} the hunt for the menu image src: the page is needed even if no login form was shown
      await page.goto(paramUrl, { waitUntil: 'networkidle0' })
      // close cookie policy
      const cookieXPath =
        '//span[contains(text(), "Allow essential and optional cookies")]|//span[contains(text(), "és nem kötelező cookie-k engedélyezése")]'
      await page.waitForTimeout(2000)
      try {
        const cookieButton = await page.$x(cookieXPath)
        if (cookieButton.length > 0) await cookieButton[0].click()
      } catch (e) {
        // best effort only: the scrape is attempted even if the cookie banner could not be closed
      }
      // scroll down a bit for more relevant images
      await page.evaluate(() => window.scrollBy(0, window.innerHeight * 4))
      imageAltArray = await collectMenuAlts(page)
      if (imageAltArray.length < 1) {
        // if the images are irrelevant then scroll down a bit more
        await page.evaluate(() => window.scrollBy(0, window.innerHeight * 2))
        imageAltArray = await collectMenuAlts(page)
      }
      /*
       *kept for debug purposes; Note: fs module will be required
       *await page.screenshot({ path: __dirname + '/screen.png' })
       *const screenBase64 = fs.readFileSync(__dirname + '/screen.png', 'base64')
       *console.log('data:image/png;base64, ' + screenBase64)
       */
    } catch (e) {
      console.error(e)
    }

    // log the candidates once instead of on every iteration
    if (imageAltArray.length > 0) console.log(imageAltArray)

    // @ {RESTAURANT} OCR (using fb's own OCR in alt tags)
    for (let i = 0; i < imageAltArray.length; i++) {
      try {
        const parsedResult = imageAltArray[i]
        // @ {RESTAURANT} Monday-Friday
        if (!parsedResult.match(paramMenuHandleRegex)) continue

        // @ {RESTAURANT} price catch
        const { price, priceCurrencyStr, priceCurrency } = priceCatcher.priceCatcher(parsedResult)
        const trend = await priceCompareToDb.priceCompareToDb(paramTitleString, price)
        const paramPriceString = price
        const paramPriceCurrency = priceCurrency
        const paramPriceCurrencyString = priceCurrencyStr + trend

        const restaurantDaily = parsedResult.match(todaysRegex)
        if (restaurantDaily === null) {
          console.log(
            paramTitleString + ' parsed result is: ' + restaurantDaily + ' at ' + i + 'th matching image'
          )
          continue
        }

        // @ {RESTAURANT} clean string
        const paramValueString =
          '• Daily menu: ' + (await stringValueCleaner.stringValueCleaner(restaurantDaily[0], true))
        console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))
        console.log(paramValueString)
        console.log(paramPriceString + paramPriceCurrencyString + '\n')
        // @ {RESTAURANT} object
        const obj = new RestaurantMenuOutput(
          paramColor,
          paramTitleString,
          paramUrl,
          paramIcon,
          paramValueString,
          paramPriceString,
          paramPriceCurrency,
          paramPriceCurrencyString,
          paramAddressString
        )
        const mongoObj = new RestaurantMenuDb(
          paramTitleString,
          paramPriceString,
          paramPriceCurrency,
          paramValueString
        )
        if (objectDecider.objectDecider(paramValueString)) {
          finalJSON.attachments.push(obj)
          finalMongoJSON.push(mongoObj)
        }

        // the first image with a menu for today wins
        break
      } catch (e) {
        // a broken image must not stop the remaining candidates from being checked
        console.error(e)
      }
    }
  } finally {
    // the tab is released on every path, the shared browser is only disconnected (not closed)
    await page.goto('about:blank')
    await page.close()
    await browser.disconnect()
  }
}
12 | 13 | Currently the project's main js contains one headless Chrome instance with multiple async functions scraping daily and weekly menus (Monday to Friday) of restaurants from downtown Budapest (Hungary). 14 | 15 | So far the scrapers are diverse: **(1)** harvests facebook posts for images, then OCR their content; **(2)** OCR menus uploaded in jpg image and table format; **(3)** scrape regular restaurant websites and get content via DOM and **(4)** also scraping facebook post texts based on regex patterns. 16 | 17 | The final output is posted to slack via webhooks. 18 | 19 | ###### KEYWORDS: [puppeteer](https://github.com/search?q=puppeteer) | [OCR](https://github.com/search?q=ocr) | [web scraping](https://github.com/search?q=web+scraping) | [facebook scraping](https://github.com/search?q=facebook+scraping) | [webhooks](https://github.com/search?q=webhooks) 20 | 21 | ### What can you do here? 22 | 23 | - scrape daily menus and post the information with webhooks to slack; 24 | - scrape images from facebook posts and retrieve their content with OCR. 25 | 26 | ### Install packages 27 | 28 | `yarn install` the project. 29 | 30 | ### Environment variables 31 | 32 | I.) touch an `app.env` file (gitignored) in the root folder. Create your own OCR Space API key; request user for the daily_menu mongoDB (or create your own and replace uri in the code to fit); store the webhooks urls for slack per environment; and finally you will need facebook credentials for the facebook menu scraping. 
33 | 34 | ```bash 35 | # create your API key here: https://ocr.space/ocrapi#free 36 | export OCR_API_KEY="******************" 37 | 38 | # mongoDb credentials 39 | export MONGO_USERNAME="**************" 40 | export MONGO_PASSWORD="**************" 41 | 42 | # slack webhooks 43 | export WEBHOOK_URL_TEST=https://hooks.slack.com/services/*********/*********/************************ 44 | export WEBHOOK_URL_PROD=https://hooks.slack.com/services/*********/*********/************************ 45 | 46 | # facebook credentials 47 | export FB_USERNAME="**************" 48 | export FB_PASSWORD="**************" 49 | ``` 50 | 51 | II.) source the created file to local environment variables (depending on your platform you'll need to find a method which lasts more than the current session!): 52 | 53 | ```bash 54 | $ source app.env 55 | ``` 56 | 57 | ### Run scrapers 58 | 59 | ```bash 60 | yarn scrape 61 | ``` 62 | 63 | or `npm run scrape` 64 | 65 | _Note:_ a [cron job](https://github.com/theDavidBarton/puppeteer-daily-menu-scraper/actions?query=workflow%3Ascrape) is set up via GitHub Actions to run the node script at every weekday 10:20AM UTC! `'20 10 * * 1-5'` 66 | 67 | ### Run scrapers in debug mode 68 | 69 | **I.)** `--debug` sends slack messages to WEBHOOK_URL_TEST so you are safe to do automated (or manual) e2e tests. 70 | 71 | ```bash 72 | yarn scrape --debug 73 | ``` 74 | 75 | **II.)** `--debug --date=[0-6]__YYYY.MM.DD.` For debug purposes you are able to run script with a 2nd argument like below, where 2 means: day is Tuesday (0: Sunday, 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday) and 2019.12.24. overrides the value of date.todayDotSeparated. You need to separate the two values by a double underscore '\_\_' ! 76 | 77 | ```bash 78 | yarn scrape --debug --date=2__2019.12.24.
79 | ``` 80 | 81 | ## API 82 | 83 | ```bash 84 | node ./src/server.js 85 | ``` 86 | 87 | or `yarn start` 88 | 89 | ### Endpoints 90 | 91 | - GET `/api/1/daily-menu/` => latest menu 92 | - GET `/api/1/daily-menu/{YYYY-MM-DD}` => menu of the selected date 93 | 94 | example: http://localhost:5000/api/1/daily-menu/2020-01-09 95 | 96 | **successful response:** 97 | 98 | code: `200` 99 | 100 | ```json 101 | [ 102 | { 103 | "_id":"gdfgd55jk76k76k78l8jgdfsyc22", 104 | "timestamp":"2020.01.09.", 105 | "restaurant":"Karcsi Vendéglö", 106 | "price":"1100", 107 | "currency":"n/a", 108 | "menuString":"• Weekly offer: fokhagyma krémleves borzaska párolt rizzsel\n• Daily menu: korhely leves rozmaringos sertésragu leveszöldbab főzelék debrecenivel milánói sertésszelet" 109 | }, 110 | { 111 | "_id":"gdfgd55jk76k76k78l8jgdfsyc23", 112 | "timestamp":"2020.01.09.", 113 | "restaurant":"Bistro Suppé", 114 | "price":"1190", 115 | "currency":"HUF", 116 | "menuString":"Orly bundában sült csirkemellfilé, jázminrizzsel a mai menünk\nLevesek - Lengyel kolbászos burgonyaleves - Gyömbéres csirkeleves - Sütőtök krémleves...
Főzelékek - Sólet - Kelkáposzta " 117 | }, 118 | { 119 | "_id":"gdfgd55jk76k76k78l8jgdfsyc24", 120 | "timestamp":"2020.01.09.", 121 | "restaurant":"Kamra Ételbár", 122 | "price":"1090", 123 | "currency":"HUF", 124 | "menuString":"• Daily menu: Zellerkrémleves, Bazsalikomos csirkés farfalle (1090.-Ft), Gombapaprikás tésztával (1100.-Ft), Rántott gomba tartárral körettel (1100.-Ft), Sajttal-sonkával töltött csibebatyu (1450.-Ft), Rántott csirkecomb petrezselymes burgonyával (1250.-Ft), Csőben sült fetás baconos csirkemell (1390.-Ft), Zúzapörkölt tarhonyával (1250.-Ft), Somlói galuska (650.-Ft), Feketeerdei sonkás gnocchi (1100.-Ft), Tejszines kapros piritott mogyorós csirkecsikok körettel (1250.-Ft), Gluténmentes főzelék: zöldborsó, tök, lencse (450.-Ft), Palermoi csirkemell paradicsomos rizzsel (1450.-Ft)" 125 | }, 126 | { 127 | "_id":"gdfgd55jk76k76k78l8jgdfsyc25", 128 | "timestamp":"2020.01.09.", 129 | "restaurant":"Fruccola (Arany Janos utca)", 130 | "price":"2190", 131 | "currency":"HUF", 132 | "menuString":"• Daily menu: Fűszres mogyoróvajas zöldségleves, Szárított paradicsomos füstölt sajtos csirkemell rolád, mediterrán tepsis burgonya" 133 | }, 134 | [...] 135 | ] 136 | ``` 137 | 138 | **error response:** 139 | 140 | code: `404` 141 | 142 | ```json 143 | { "error": "no menu for the selected date!" 
} 144 | ``` 145 | 146 | # Links 147 | 148 | [The home of Puppeteer](https://pptr.dev) 149 | 150 | [GitHub Puppeteer](https://github.com/GoogleChrome/puppeteer) 151 | 152 | # License 153 | 154 | [MIT](/LICENSE.md) 155 | 156 | Copyright (c) 2020 David Barton 157 | 158 | Permission is hereby granted, free of charge, to any person obtaining a copy 159 | of this software and associated documentation files (the "Software"), to deal 160 | in the Software without restriction, including without limitation the rights 161 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 162 | copies of the Software, and to permit persons to whom the Software is 163 | furnished to do so, subject to the following conditions: 164 | 165 | The above copyright notice and this permission notice shall be included in all 166 | copies or substantial portions of the Software. 167 | 168 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 169 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 170 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 171 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 172 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 173 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 174 | SOFTWARE. 
175 | 176 | --- 177 | 178 | Dependency licenses: [NOTICE](/LICENSES.md) 179 | -------------------------------------------------------------------------------- /scrapers/nokedli.js: -------------------------------------------------------------------------------- 1 | /* 2 | * ___________ 3 | * MIT License 4 | * 5 | * Copyright (c) 2020 David Barton 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 
// vertical range of the menu rows on the full size image (both limits are exclusive)
const nokedliTableRows = { top: 520, bottom: 1930 }

// horizontal range of each day's column (exclusive limits), keyed by the numeric weekday used in `today`
const nokedliTableColumns = new Map([
  [1, { left: 780, right: 980 }], // Monday
  [2, { left: 1310, right: 1546 }], // Tuesday
  [3, { left: 1815, right: 2090 }], // Wednesday
  [4, { left: 2345, right: 2620 }], // Thursday
  [5, { left: 2880, right: 3110 }] // Friday
])

/*
 * Groups the OCR-ed words by weekday: checks word coordinates against the predefined map of the table.
 * @lines : the TextOverlay.Lines array of the OCR result
 * returns a Map of weekday number => array of dish names
 */
function nokedliDishesByDay(lines) {
  const wordsByDay = new Map()
  for (const day of nokedliTableColumns.keys()) wordsByDay.set(day, [])

  for (const line of lines) {
    if (line.Words.length === 0) continue
    // the position of the line's first word decides where every word of the line belongs
    const { Left: lineLeft, Top: lineTop } = line.Words[0]
    if (lineTop <= nokedliTableRows.top || lineTop >= nokedliTableRows.bottom) continue

    for (const [day, column] of nokedliTableColumns) {
      if (lineLeft > column.left && lineLeft < column.right) {
        wordsByDay.get(day).push(...line.Words.map(word => word.WordText))
      }
    }
  }

  // dishes are separated once per day: a new dish starts at each word beginning with a capital letter
  const dishesByDay = new Map()
  for (const [day, words] of wordsByDay) {
    dishesByDay.set(
      day,
      words
        .join(' ')
        .split(/(?= [A-ZÁÍŰŐÜÖÚÓÉ])/g)
        .map(dish => dish.trim())
    )
  }
  return dishesByDay
}

/*
 * Scrapes the weekly menu image of Nokedli, OCRs it as a table and stores today's column.
 * If objectDecider accepts the menu string the result is pushed to finalJSON.attachments
 * and finalMongoJSON, the function itself resolves to undefined.
 */
async function scraper() {
  const browser = await puppeteer.connect({ browserWSEndpoint })
  const page = await browser.newPage()

  try {
    // abort all images, source: https://github.com/GoogleChrome/puppeteer/blob/master/examples/block-images.js
    await page.setRequestInterception(true)
    page.on('request', request => {
      if (request.resourceType() === 'image') {
        request.abort()
      } else {
        request.continue()
      }
    })

    /*
     * @ NOKEDLI
     * ------------------------------------------
     * contact info:
     * Address: Budapest, Weiner Leó u. 17, 1065
     * Phone: (20) 499 5832
     * -----------------------------------------
     * imageSelector --> imageNokedliSelector
     * store src
     * trim thumbnail sub for normal sized image
     * download and reduce image size
     * OCR the table
     */

    // @ NOKEDLI parameters
    const paramColor = '#f9c32c'
    const paramTitleString = 'Nokedli'
    const paramUrl = 'http://nokedlikifozde.hu/'
    const paramIcon =
      'https://scontent.fbud1-1.fna.fbcdn.net/v/t1.0-1/p320x320/969066_507629642637360_22543675_n.jpg?_nc_cat=108&_nc_ht=scontent.fbud1-1.fna&oh=a2e8efd55605ba9b7b63553dc54c23ca&oe=5D6F4115'
    const paramAddressString = 'Budapest, Weiner Leó u. 17, 1065'
    const paramPriceString = 'n/a'
    const paramPriceCurrency = 'n/a'
    const paramPriceCurrencyString = ''
    let paramValueString
    let weeklyNokedli

    // @ NOKEDLI selector
    const imageNokedliSelector = '.aligncenter'

    try {
      await page.goto(paramUrl, { waitUntil: 'networkidle0' })
      // @ NOKEDLI weekly
      weeklyNokedli = await page.evaluate(el => el.src, await page.$(imageNokedliSelector))
      weeklyNokedli = weeklyNokedli.replace('-300x212', '')
    } catch (e) {
      console.error(e)
    }

    // without an image src the OCR request would be sent for '...url=undefined', so it is skipped
    if (!weeklyNokedli) {
      console.error(paramTitleString + ' menu image was not found, OCR is skipped')
      return
    }

    // @ NOKEDLI compress image with the great Images.weserv.nl 💚 API https://images.weserv.nl/#image-api
    const imageURL = 'https://images.weserv.nl/?url=' + weeklyNokedli + '&q=60'

    // @ NOKEDLI OCR
    const options = {
      method: 'POST',
      url: 'https://api.ocr.space/parse/image',
      headers: {
        apikey: process.env.OCR_API_KEY
      },
      formData: {
        language: 'hun',
        isOverlayRequired: 'true',
        url: imageURL,
        scale: 'true',
        isTable: 'true'
      }
    }
    try {
      const parsedResult = await ocrSpaceApiSimple.ocrSpaceApiSimple(options)
      const dishesByDay = nokedliDishesByDay(parsedResult.TextOverlay.Lines)

      console.log('*' + paramTitleString + '* \n' + '-'.repeat(paramTitleString.length))

      // only Monday-Friday (1-5) have a column in the table
      const todaysDishes = dishesByDay.get(today)
      if (todaysDishes !== undefined) {
        paramValueString =
          '• Daily menu: ' + (await stringValueCleaner.stringValueCleaner(todaysDishes.join(', '), true))
      } else {
        paramValueString = 'weekend work, eh?\n'
      }
      console.log(paramValueString)
      console.log(paramPriceString + paramPriceCurrencyString + '\n')

      // @ NOKEDLI object
      const obj = new RestaurantMenuOutput(
        paramColor,
        paramTitleString,
        paramUrl,
        paramIcon,
        paramValueString,
        paramPriceString,
        paramPriceCurrency,
        paramPriceCurrencyString,
        paramAddressString
      )
      const mongoObj = new RestaurantMenuDb(
        paramTitleString,
        paramPriceString,
        paramPriceCurrency,
        paramValueString
      )
      if (objectDecider.objectDecider(paramValueString)) {
        finalJSON.attachments.push(obj)
        finalMongoJSON.push(mongoObj)
      }
    } catch (e) {
      console.error(e)
    }
  } finally {
    // the tab is released on every path, the shared browser is only disconnected (not closed)
    await page.goto('about:blank')
    await page.close()
    await browser.disconnect()
  }
}