├── .gitignore
├── apify.json
├── test
│   ├── helpers
│   │   └── chai.js
│   └── utility_spec.js
├── .editorconfig
├── .eslintrc
├── jsconfig.json
├── package.json
├── Dockerfile
├── CHANGELOG.md
├── src
│   ├── consts.js
│   ├── main.js
│   ├── subtitles.js
│   ├── crawler_utils.js
│   └── utility.js
├── .actor
│   └── actor.json
├── INPUT_SCHEMA.json
├── README.md
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | apify_storage
3 | node_modules
4 | logs
5 | *.log
6 | .DS_Store
7 | package-lock.json
8 |
--------------------------------------------------------------------------------
/apify.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "youtube-search-scraper",
3 |     "version": "0.0",
4 |     "buildTag": "latest",
5 |     "env": {
6 |     },
7 |     "template": "puppeteer_crawler"
8 | }
9 |
--------------------------------------------------------------------------------
/test/helpers/chai.js:
--------------------------------------------------------------------------------
1 | const chai = require('chai');
2 |
3 | chai.config.includeStack = true;
4 |
5 | global.expect = chai.expect;
6 | global.AssertionError = chai.AssertionError;
7 | global.Assertion = chai.Assertion;
8 | global.assert = chai.assert;
9 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | charset = utf-8
7 | trim_trailing_whitespace = true
8 | insert_final_newline = true
9 | end_of_line = lf
10 | # editorconfig-tools is unable to ignore long strings or urls
11 | max_line_length = null
12 |
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 |     "extends": "@apify",
3 |     "env": {
4 |         "mocha": true
5 |     },
6 |     "parserOptions": {
7 |         "ecmaVersion": 2020
8 |     },
9 |     "rules": {
10 |         "no-throw-literal": "off",
11 |         "no-unused-vars": [
12 |             "error",
13 |             {
14 |                 "varsIgnorePattern": "should|expect"
15 |             }
16 |         ]
17 |     }
18 | }
19 |
--------------------------------------------------------------------------------
/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |     "compilerOptions": {
3 |         "target": "es2018",
4 |         "module": "commonJS",
5 |         "lib": [
6 |             "dom",
7 |             "dom.iterable",
8 |             "es5",
9 |             "es6",
10 |             "es2018",
11 |             "es2019.array",
12 |             "es2019.object",
13 |             "es2020.string"
14 |         ],
15 |         "strict": true,
16 |         "checkJs": true,
17 |         "alwaysStrict": true,
18 |         "moduleResolution": "node",
19 |         "esModuleInterop": true,
20 |         "noImplicitAny": true,
21 |         "noImplicitReturns": false,
22 |         "allowSyntheticDefaultImports": false
23 |     },
24 |     "include": [
25 |         "./src/*.js"
26 |     ]
27 | }
28 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "actor-youtube-scraper",
3 |     "version": "0.0.1",
4 |     "description": "Actor to scrape youtube videos",
5 |     "keywords": [
6 |         "apify",
7 |         "apifier",
8 |         "crawler",
9 |         "search",
10 |         "youtube",
11 |         "puppeteer"
12 |     ],
13 |     "dependencies": {
14 |         "apify": "^2.3.0",
15 |         "moment": "^2.29.1",
16 |         "node-fetch": "^2.6.1",
17 |         "puppeteer": "*"
18 |     },
19 |     "devDependencies": {
20 |         "@apify/eslint-config": "^0.1.4",
"@types/puppeteer": "^5.4.2", 22 | "chai": "^4.2.0", 23 | "eslint": "7.17.0", 24 | "mocha": "^8.2.1", 25 | "sinon": "^9.2.3" 26 | }, 27 | "scripts": { 28 | "start": "node src/main.js", 29 | "test": "mocha test/utility_spec.js", 30 | "apify": "apify run" 31 | }, 32 | "author": "bernardo@sonkomail.com", 33 | "contributors": [ 34 | "Bernard Okoth " 35 | ], 36 | "license": "Apache-2.0", 37 | "homepage": "https://github.com/bernardro/actor-youtube-scraper", 38 | "repository": { 39 | "type": "git", 40 | "url": "git+https://github.com/bernardro/actor-youtube-scraper" 41 | }, 42 | "bugs": { 43 | "url": "https://github.com/bernardro/actor-youtube-scraper/issues" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # First, specify the base Docker image. You can read more about 2 | # the available images at https://sdk.apify.com/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-puppeteer-chrome:16 5 | 6 | # Second, copy just package.json and package-lock.json since it should be 7 | # the only file that affects "npm install" in the next step, to speed up the build 8 | COPY package*.json ./ 9 | 10 | # Install NPM packages, skip optional and development dependencies to 11 | # keep the image small. Avoid logging too much and print the dependency 12 | # tree for debugging 13 | RUN npm --quiet set progress=false \ 14 | && npm install --only=prod --no-optional \ 15 | && echo "Installed NPM packages:" \ 16 | && (npm list --only=prod --no-optional --all || true) \ 17 | && echo "Node.js version:" \ 18 | && node --version \ 19 | && echo "NPM version:" \ 20 | && npm --version 21 | 22 | # Next, copy the remaining files and directories with the source code. 23 | # Since we do this after NPM install, quick build will be really fast 24 | # for most source file changes. 25 | COPY . ./ 26 | 27 | # Optionally, specify how to launch the source code of your actor. 28 | # By default, Apify's base Docker images define the CMD instruction 29 | # that runs the Node.js source code using the command specified 30 | # in the "scripts.start" section of the package.json file. 31 | # In short, the instruction looks something like this: 32 | # 33 | # CMD npm start -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 2022-07-20 2 | *Fixes* 3 | - Correctly handle videos with comments turned off. 4 | - Add `commentsTurnedOff` to output. 5 | 6 | ## 2022-06-10 7 | *Fixes*: 8 | - Channel page without `/watch` selector 9 | 10 | ## 2021-09-15 11 | *Features* 12 | - Add possibility to scrape video comments. See `maxComments` input field. 13 | 14 | 2021-06-16 15 | *Features* 16 | - Revamped subtitles downloading - added possibility to download all available subtitles 17 | (availability defined by languages) and to prefer automatically generated subtitles before the user generated 18 | ones. 19 | 20 | 21 | 2021-06-14 22 | *Features*: 23 | - Add subtitle type to output (extendedOutputFunction). **Note**: You must set `downloadSubtitles` variable to `true` for this 24 | feature to take effect. 
25 |
26 | ## 2021-06-11
27 | *Features*
28 | - Subtitles are now downloadable (saved to KeyValueStore as `videoID_languageCode`)
29 |
30 | ## 2021-05-21
31 | *Features*
32 | - Update SDK
33 |
34 | *Fixes*
35 | - Random zero results when searching
36 | - Click consent dialog
37 |
38 | ## 2021-04-14
39 | *Fixes*
40 | - Fixed changed selector that completely prevented the scrape
41 |
42 | ## 2021-03-21
43 | *Features*
44 | - Updated SDK version for session pool changes
45 | - Add `handlePageTimeoutSecs` parameter to INPUT_SCHEMA
46 |
47 |
48 | ## 2021-03-15
49 | *Fixes*
50 | - Fixed selector causing no data scraped
51 | - Removed stealth causing issues with new layout
52 |
53 | ## 2020-09-27
54 | - Increased waiting timeouts to better handle concurrency
55 | - Added saving screenshots on errors
56 | - Better handling of captchas: the page is automatically retried and the browser is restarted with a new proxy
57 | - `verboseLog` is off by default
58 | - Added info about how many videos were enqueued, and overall better logging
59 |
--------------------------------------------------------------------------------
/src/consts.js:
--------------------------------------------------------------------------------
1 | exports.DELAY = {
2 |     KEY_PRESS: { MIN: 5, MAX: 25 },
3 |     BTWN_KEY_PRESS: { MIN: 45, MAX: 375 },
4 |     MOUSE_CLICK: { MIN: 40, MAX: 150 },
5 |     HUMAN_PAUSE: { MIN: 300, MAX: 800 },
6 |     START_LOADING_MORE_VIDEOS: 3000,
7 | };
8 |
9 | exports.MOUSE_STEPS = 5;
10 |
11 | // 'document', 'image', 'xhr', 'script', 'stylesheet', 'font', 'other', 'manifest'
12 | exports.MEDIA_TYPES = ['image'];
13 |
14 | exports.LABELS = {
15 |     DETAIL: 'DETAIL',
16 |     MASTER: 'MASTER',
17 |     CHANNEL: 'CHANNEL',
18 |     SEARCH: 'SEARCH',
19 | };
20 |
21 | exports.SELECTORS = {
22 |     SEARCH: {
23 |         searchBox: 'input#search',
24 |         toggleFilterMenu: '#button[aria-label="Search filters"]',
25 |         filterBtnsXp: '//ytd-search-filter-renderer/a/div/yt-formatted-string',
26 |         youtubeVideosSection: 'ytd-item-section-renderer',
27 |         youtubeVideosRenderer: 'ytd-video-renderer,ytd-grid-video-renderer', // grid is for channels
28 |         url: 'a[href^="/watch"]',
29 |         videoTitle: '#video-title',
30 |         channelNameText: '#channel-name #text-container', // multiple, get first
31 |         subscriberCount: '#subscriber-count',
32 |         canonicalUrl: 'link[rel="canonical"]',
33 |         simplifiedResultVideoTitle: '#video-title',
34 |         simplifiedResultDurationText: '#text',
35 |         simplifiedResultChannelName: '#channel-info > #channel-name',
36 |         simlifiedResultChannelUrl: '#channel-info > a',
37 |         simplifiedResultViewCount: '#metadata-line > span:nth-child(1)',
38 |         simplifiedResultDate: '#metadata-line > span:nth-child(2)',
39 |     },
40 |     VIDEO: {
41 |         titleXp: '//ytd-video-primary-info-renderer/div/h1/yt-formatted-string',
42 |         viewCountXp: '//*[@id="count"]/ytd-video-view-count-renderer/span[1]',
43 |         uploadDateXp: '//ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string',
44 |         likesXp: "//ytd-menu-renderer/div/ytd-toggle-button-renderer[1]/a/*[@id='text']",
45 |         dislikesXp: "//ytd-menu-renderer/div/ytd-toggle-button-renderer[2]/a/*[@id='text']",
46 |         channelXp: '//ytd-channel-name/div/div/yt-formatted-string/a',
47 |         subscribersXp: "//*[@id='owner-sub-count']",
48 |         descriptionXp: '//ytd-expander/div/div/yt-formatted-string',
49 |         durationSlctr: '#movie_player span.ytp-time-duration',
50 |         commentsSlctr: '.count-text',
51 |     },
52 | };
53 |
--------------------------------------------------------------------------------
/.actor/actor.json:
--------------------------------------------------------------------------------
1 | {
2 |     "actorSpecification": 1,
3 |     "name": "youtube-scraper",
4 |     "title": "Youtube Scraper",
5 |     "description": "Scrapes Youtube",
6 |     "version": "0.0.1",
7 |     "storages": {
8 |         "dataset": {
9 |             "actorSpecification": 1,
10 |             "title": "Youtube Scraper",
11 |             "description": "To see all scraped properties, export the whole dataset or select All fields instead of Overview",
12 |             "views": {
13 |                 "overview": {
14 |                     "title": "Overview",
15 |                     "description": "",
16 |                     "transformation": {
17 |                         "fields": [
18 |                             "title",
19 |                             "id",
20 |                             "url",
21 |                             "viewCount",
22 |                             "date",
23 |                             "likes",
24 |                             "channelName",
25 |                             "channelUrl",
26 |                             "numberOfSubscribers",
27 |                             "duration"
28 |                         ]
29 |                     },
30 |                     "display": {
31 |                         "component": "table",
32 |                         "columns": [
33 |                             {
34 |                                 "label": "Title",
35 |                                 "format": "text",
36 |                                 "field": "title"
37 |                             },
38 |                             {
39 |                                 "label": "URL",
40 |                                 "format": "text",
41 |                                 "field": "url"
42 |                             },
43 |                             {
44 |                                 "label": "Number of views",
45 |                                 "format": "text",
46 |                                 "field": "viewCount"
47 |                             },
48 |                             {
49 |                                 "label": "Date",
50 |                                 "format": "text",
51 |                                 "field": "date"
52 |                             },
53 |                             {
54 |                                 "label": "Number of likes",
55 |                                 "format": "text",
56 |                                 "field": "likes"
57 |                             },
58 |                             {
59 |                                 "label": "Channel name",
60 |                                 "format": "text",
61 |                                 "field": "channelName"
62 |                             },
63 |                             {
64 |                                 "label": "Channel URL",
65 |                                 "format": "text",
66 |                                 "field": "channelUrl"
67 |                             },
68 |                             {
69 |                                 "label": "Number of subscribers",
70 |                                 "format": "text",
71 |                                 "field": "numberOfSubscribers"
72 |                             },
73 |                             {
74 |                                 "label": "Duration",
75 |                                 "format": "text",
76 |                                 "field": "duration"
77 |                             }
78 |                         ]
79 |                     }
80 |                 }
81 |             }
82 |         }
83 |     }
84 | }
85 |
--------------------------------------------------------------------------------
/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |     "title": "Input schema for the youtube-search-scraper actor",
3 |     "type": "object",
4 |     "schemaVersion": 1,
5 |     "properties": {
6 |         "searchKeywords": {
7 |             "title": "Search keywords",
8 |             "type": "string",
9 |             "description": "What to search YouTube for",
10 |             "editor": "textfield",
11 |             "prefill": "Madonna",
12 |             "maxLength": 250
13 |         },
14 |         "maxResults": {
15 |             "title": "Maximum search results",
16 |             "type": "integer",
17 |             "description": "Limit the number of videos to crawl; leave the field empty for no limit",
18 |             "minimum": 1,
19 |             "maximum": 999999,
20 |             "prefill": 50,
21 |             "unit": "Videos",
22 |             "nullable": true
23 |         },
24 |         "startUrls": {
25 |             "title": "Direct URLs",
26 |             "type": "array",
27 |             "description": "Provide direct YouTube URLs, such as channel URLs or search URLs. When provided, this disables keyword search.",
28 |             "default": [],
29 |             "editor": "requestListSources",
30 |             "sectionCaption": "Direct URLs"
31 |         },
32 |         "simplifiedInformation": {
33 |             "title": "Get simplified information from a channel",
34 |             "type": "boolean",
35 |             "default": false,
36 |             "description": "If set to true, we only get basic video information from the channel. You have to provide a channel URL as a start URL"
37 |         },
38 |         "maxComments": {
39 |             "title": "Max comments",
40 |             "type": "integer",
41 |             "description": "Limit the number of comments that will get scraped. Scraping comments requires scrolling and takes time. 0 or empty means we will not scrape any comments at all.",
42 |             "default": 0,
43 |             "sectionCaption": "Comments & Subtitles",
44 |             "sectionDescription": "Use this if you want to download comments and/or video subtitles as well."
45 |         },
46 |         "downloadSubtitles": {
47 |             "title": "Download subtitles",
48 |             "type": "boolean",
49 |             "description": "If set to true, we will also download subtitles for the given video and convert them to SRT format"
50 |         },
51 |         "saveSubsToKVS": {
52 |             "title": "Save downloaded subtitles to Key Value Store",
53 |             "type": "boolean",
54 |             "description": "If set to true, we will save downloaded (and converted to .srt format) subtitles to the Key Value Store in the format { subtitles: srt_text }. 'downloadSubtitles' must be set to true for this to take effect"
55 |         },
56 |         "subtitlesLanguage": {
57 |             "title": "Download subtitles in selected language",
58 |             "type": "string",
59 |             "description": "Language to download subtitles in. 'downloadSubtitles' must be set to true for this to take effect",
60 |             "editor": "select",
61 |             "default": "en",
62 |             "enum": ["en", "de", "es", "fr", "it", "ja", "ko", "nl", "pt", "ru"],
63 |             "enumTitles": ["English", "German", "Spanish", "French", "Italian", "Japanese", "Korean", "Dutch", "Portuguese", "Russian"]
64 |         },
65 |         "preferAutoGeneratedSubtitles": {
66 |             "title": "Prefer automatically generated subtitles over the user-generated ones.",
67 |             "description": "If set to `true` and a language is provided, we download automatically generated subtitles rather than user-generated ones. If no language is provided, this setting has no effect",
68 |             "type": "boolean"
69 |         },
70 |         "extendOutputFunction": {
71 |             "title": "Extend Output Function",
72 |             "description": "Add or remove properties on the output object, or omit the output altogether by returning null",
73 |             "type": "string",
74 |             "default": "",
75 |             "prefill": "async ({ data, item, page, request, customData }) => {\n return item; \n}",
76 |             "editor": "javascript",
77 |             "sectionCaption": "Extend scraper functionality",
78 |             "sectionDescription": "You can change the output of the items for your dataset here, or add additional behavior to the scraper"
79 |         },
80 |         "extendScraperFunction": {
81 |             "title": "Extend Scraper Function",
82 |             "description": "Advanced function that allows you to extend the default scraper functionality, allowing you to manually perform actions on the page",
83 |             "type": "string",
84 |             "default": "",
85 |             "prefill": "async ({ page, request, requestQueue, customData, Apify, extendOutputFunction }) => {\n \n}",
86 |             "editor": "javascript"
87 |         },
88 |         "customData": {
89 |             "title": "Custom data",
90 |             "description": "Any data that you want to have available inside the Extend Output/Scraper Function",
91 |             "default": {},
92 |             "prefill": {},
93 |             "type": "object",
94 |             "editor": "json"
95 |         },
96 |         "handlePageTimeoutSecs": {
97 |             "title": "Handle page timeout",
98 |             "description": "Timeout for handling a single page, in seconds",
99 |             "default": 3600,
100 |             "prefill": 3600,
101 |             "type": "integer",
102 |             "editor": "number"
103 |         },
104 |         "proxyConfiguration": {
105 |             "title": "Proxy configuration",
106 |             "type": "object",
107 |             "editor": "proxy",
108 |             "description": "The best option is usually Automatic proxy, but you can also use your own proxies or no proxy",
109 |             "prefill": {
110 |                 "useApifyProxy": true
111 |             },
112 |             "default": {
113 |                 "useApifyProxy": true
114 |             },
115 |             "sectionCaption": "Proxy and browser configuration"
116 |         },
117 |         "verboseLog": {
118 |             "title": "Verbose log",
119 |             "type": "boolean",
120 |             "description": "If set to true, debug-level messages are logged, giving you more detail about the run"
121 |         }
122 |     },
123 |     "required": [
124 |         "proxyConfiguration"
125 |     ]
126 | }
127 |
--------------------------------------------------------------------------------
/src/main.js:
--------------------------------------------------------------------------------
1 | const Apify = require('apify');
2 |
3 | const utils = require('./utility');
4 | const crawler = require('./crawler_utils');
5 |
6 | const { log, puppeteer } = Apify.utils;
7 |
8 | Apify.main(async () => {
9 |     /**
10 |      * @type {any}
11 |      */
12 |     const input = await Apify.getInput();
13 |
14 |     const {
15 |         verboseLog,
16 |         startUrls = [],
17 |         proxyConfiguration,
18 |         searchKeywords,
19 |         maxResults,
20 |         simplifiedInformation = false,
21 |         // postsFromDate,
22 |         handlePageTimeoutSecs = 3600,
23 |         downloadSubtitles = false,
24 |         saveSubsToKVS: saveSubtitlesToKVS = false,
25 |         subtitlesLanguage = null,
26 |         preferAutoGeneratedSubtitles = false,
27 |         maxComments = 0,
28 |     } = input;
29 |     if (verboseLog) {
30 |         log.setLevel(log.LEVELS.DEBUG);
31 |     }
32 |     const kvStore = await Apify.openKeyValueStore();
33 |     const requestQueue = await Apify.openRequestQueue();
34 |     const proxyConfig = await utils.proxyConfiguration({
35 |         proxyConfig: proxyConfiguration,
36 |     });
37 |
38 |     if (!searchKeywords && (!startUrls || !startUrls.length)) {
39 |         throw new Error('You need to provide either searchKeywords or startUrls as input');
40 |     }
41 |
42 |     if (startUrls && startUrls.length) {
43 |         log.info('Starting scraper with startUrls, ignoring searchKeywords');
44 |
45 |         const parseUrls = await Apify.openRequestList(null, startUrls);
46 |         let req;
47 |         // eslint-disable-next-line no-cond-assign
48 |         while (req = await parseUrls.fetchNextRequest()) {
49 |             // need to parse for requestsFromUrl first, then categorize by path
50 |             const label = utils.categorizeUrl(req.url);
51 |             const pUrl = new URL(req.url);
52 |
53 |             if (label === 'CHANNEL' && !pUrl.pathname.includes('/videos')) {
54 |                 pUrl.pathname = `${pUrl.pathname.split('/').filter((s) => s).join('/')}/videos`;
55 |                 req.url = pUrl.toString();
56 |             }
57 |             await requestQueue.addRequest({
58 |                 url: req.url,
59 |                 userData: {
60 |                     label,
61 |                 },
62 |             });
63 |         }
64 |     } else if (searchKeywords) {
65 |         // add starting url
66 |         log.info('Starting scraper with a search keyword');
67 |
68 |         for (let searchKeyword of searchKeywords.split(',')) {
69 |             searchKeyword = `${searchKeyword}`.trim();
70 |
71 |             if (searchKeyword) {
72 |                 await requestQueue.addRequest({
73 |                     url: 'https://www.youtube.com/',
74 |                     uniqueKey: `SEARCH-${searchKeyword}`,
75 |                     userData: {
76 |                         label: 'MASTER',
77 |                         search: searchKeyword,
78 |                     },
79 |                 });
80 |             }
81 |         }
82 |     }
83 |
84 |     const extendOutputFunction = await utils.extendFunction({
85 |         input,
86 |         key: 'extendOutputFunction',
87 |         output: async (data) => {
88 |             await Apify.pushData(data);
89 |         },
90 |         helpers: {},
91 |     });
92 |
93 |     const extendScraperFunction = await utils.extendFunction({
94 |         input,
95 |         key: 'extendScraperFunction',
96 |         output: async () => {}, // no-op for page interaction
97 |         helpers: {
98 |             requestQueue,
99 |             extendOutputFunction,
100 |         },
101 |     });
102 |
103 |     const pptrCrawler = new Apify.PuppeteerCrawler({
104 |         requestQueue,
105 |         browserPoolOptions: {
106 |             maxOpenPagesPerBrowser: 1,
107 |         },
108 |         useSessionPool: true,
109 |         proxyConfiguration: proxyConfig,
110 |         preNavigationHooks: [
111 |             async ({ page }, gotoOptions) => {
112 |                 await puppeteer.blockRequests(page, {
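                    // These patterns are matched as substrings of each request URL;
                    // blocking heavy media and telemetry requests saves bandwidth and
                    // speeds up page loads without affecting the scraped data.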
113 |                     urlPatterns: [
114 |                         '.mp4',
115 |                         '.webp',
116 |                         '.jpeg',
117 |                         '.jpg',
118 |                         '.gif',
119 |                         '.svg',
120 |                         '.ico',
121 |                         '.png',
122 |                         'google-analytics',
123 |                         'doubleclick.net',
124 |                         'googletagmanager',
125 |                         '/videoplayback',
126 |                         '/adview',
127 |                         '/stats/ads',
128 |                         '/stats/watchtime',
129 |                         '/stats/qoe',
130 |                         '/log_event',
131 |                     ],
132 |                 });
133 |
134 |                 gotoOptions.waitUntil = 'networkidle2';
135 |             },
136 |         ],
137 |         handlePageTimeoutSecs,
138 |         handleFailedRequestFunction: async ({ request }) => {
139 |             Apify.utils.log.error(`Request ${request.url} failed too many times`);
140 |
141 |             await Apify.pushData({
142 |                 '#debug': Apify.utils.createRequestDebugInfo(request),
143 |             });
144 |         },
145 |         handlePageFunction: async ({ page, request, session, response }) => {
146 |             // no-output function
147 |             await extendScraperFunction(undefined, {
148 |                 page,
149 |                 request,
150 |             });
151 |
152 |             const hasCaptcha = await page.$('.g-recaptcha');
153 |             if (hasCaptcha) {
154 |                 session.retire();
155 |                 throw 'Got captcha, page will be retried. If this happens often, consider increasing number of proxies';
156 |             }
157 |
158 |             if (utils.isErrorStatusCode(response.status())) {
159 |                 session.retire();
160 |                 throw `Response status is: ${response.status()} msg: ${response.statusText()}`;
161 |             }
162 |
163 |             if (page.url().includes('consent')) {
164 |                 log.info('Clicking consent dialog');
165 |
166 |                 await Promise.all([
167 |                     page.$eval('form[action*="consent"]', (el) => {
168 |                         el.querySelector('button')?.click();
169 |                     }),
170 |                     page.waitForNavigation({ waitUntil: 'networkidle2' }),
171 |                 ]);
172 |
173 |                 session.retire();
174 |             }
175 |
176 |             if (await page.$('.yt-upsell-dialog-renderer')) {
177 |                 // this dialog steals focus, so we need to click it away
178 |                 await page.evaluate(async () => {
179 |                     const noThanks = document.querySelectorAll('.yt-upsell-dialog-renderer [role="button"]');
180 |
181 |                     for (const button of noThanks) {
182 |                         if (button.textContent && button.textContent.includes('No thanks')) {
183 |                             button.click();
184 |                             break;
185 |                         }
186 |                     }
187 |                 });
188 |             }
189 |
190 |             switch (request.userData.label) {
191 |                 case 'CHANNEL':
192 |                 case 'SEARCH':
193 |                 case 'MASTER': {
194 |                     await crawler.handleMaster({ page, requestQueue, searchKeywords, maxResults, request, simplifiedInformation, input });
195 |                     break;
196 |                 }
197 |                 case 'DETAIL': {
198 |                     await crawler.handleDetail(
199 |                         page,
200 |                         request,
201 |                         extendOutputFunction,
202 |                         {
203 |                             doDownload: downloadSubtitles,
204 |                             saveToKVS: saveSubtitlesToKVS,
205 |                             language: subtitlesLanguage,
206 |                             kvs: kvStore,
207 |                             preferAutoGenerated: preferAutoGeneratedSubtitles,
208 |                         },
209 |                         maxComments,
210 |                     );
211 |                     break;
212 |                 }
213 |                 default: throw new Error('Unknown request label in handlePageFunction');
214 |             }
215 |         },
216 |     });
217 |     await pptrCrawler.run();
218 | });
219 |
--------------------------------------------------------------------------------
/src/subtitles.js:
--------------------------------------------------------------------------------
1 | const Apify = require('apify');
2 | const { log } = Apify.utils;
3 | const fetch = require('node-fetch');
4 |
5 | class SrtConvert {
6 |     static TYPE_AUTO_GENERATED = 'auto_generated';
7 |     static TYPE_USER_GENERATED = 'user_generated';
8 |
9 |     constructor(srtJson, lang, type = SrtConvert.TYPE_AUTO_GENERATED) {
10 |         this._json = srtJson;
11 |         this.language = lang;
12 |         this.type = type;
13 |
14 |         this.srt = null;
15 |
16 |         if (this.type !== SrtConvert.TYPE_AUTO_GENERATED && this.type !==
SrtConvert.TYPE_USER_GENERATED) { 17 | throw new Error(`Unknown subtitles type ${this.type}`); 18 | } 19 | } 20 | 21 | convert() { 22 | let subtitles = ''; 23 | let subtCounter = 1; 24 | const events = this._json['events']; 25 | for (let i = 0; i < events.length; i++) { 26 | const e = events[i]; 27 | const segs = e['segs']; 28 | if (segs) { 29 | let line = ''; 30 | segs.forEach(s => { 31 | line += s['utf8'].replace(/\n/g, ' '); 32 | }) 33 | if (line !== '\n') { 34 | const tStart = e['tStartMs']; 35 | subtitles += `${subtCounter}\n`; 36 | subtitles += `${this._msToHMS(tStart)} --> ${this._msToHMS(tStart + e['dDurationMs'])}\n`; 37 | subtitles += `${line}\n\n`; 38 | subtCounter++; 39 | } 40 | } 41 | } 42 | 43 | this.srt = subtitles; 44 | 45 | return subtitles; 46 | } 47 | 48 | _msToHMS(ms) { 49 | let frac = String(ms % 1000); 50 | frac = ('000' + frac).substring(frac.length); 51 | let sec = Math.floor(ms / 1000); 52 | let hrs = Math.floor(sec / 3600); 53 | sec -= hrs * 3600; 54 | let min = Math.floor(sec / 60); 55 | sec -= min * 60; 56 | sec = ('00' + sec).substring(String(sec).length); 57 | 58 | if (hrs > 0) { 59 | min = ('00' + min).substring(String(min).length); 60 | return ('00' + hrs).substring(String(hrs).length) + ":" + min + ":" + sec + ',' + frac; 61 | } else { 62 | return '00:' + ('00' + min).substring(String(min).length) + ":" + sec + ',' + frac; 63 | } 64 | } 65 | 66 | } 67 | 68 | /** 69 | * This function fetches list of available subtitles from video detail page and then, depending on provided settings, 70 | * fetches subtitle JSONs and converts them to .srt format. 71 | * 72 | * @param page Puppeteer page. 73 | * @param language Preferred language. If `null` or `''`, we are instructed to fetch all available subtitles. 74 | * @param preferAutoGenerated If set to true, we prefer automatically generated subtitles before the user provided. 75 | * If set to false and only automatically generated subtitles are available, we fetch at least them. 76 | * @returns {Promise<*[]>} Promise representing the whole fetching and srt generating process. Promise result is 77 | * list of `SrtConvert` instances containing already converted .srt data. See `SrtConvert` class. 78 | */ 79 | async function fetchSubtitles(page, language = null, preferAutoGenerated = false) { 80 | log.debug(`Fetching subtitles for ${page.url()},lang:${language}...`); 81 | 82 | const converters = []; 83 | const script = await page.evaluate(() => { 84 | const scripts = document.body.querySelectorAll('script'); 85 | let target = null; 86 | scripts.forEach(s => { 87 | const html = s.innerHTML; 88 | if (html.startsWith('var ytInitialPlayerResponse')) { 89 | target = html; 90 | } 91 | }); 92 | return target; 93 | }); 94 | 95 | try { 96 | let subtitlesJSON = JSON.parse(`{${String(script).match(/\"captionTracks\".*?(?=])/)}]}`); 97 | const captionTracks = subtitlesJSON['captionTracks']; 98 | let subtitlesToDl = []; 99 | if (!language) { 100 | for (let i = 0; i < captionTracks.length; i++) { 101 | const track = captionTracks[i]; 102 | subtitlesToDl.push({ 103 | lang: track['languageCode'], 104 | url: `${track['baseUrl']}&fmt=json3`, 105 | type: track['kind'] ? 
SrtConvert.TYPE_AUTO_GENERATED : SrtConvert.TYPE_USER_GENERATED,
106 |                 });
107 |             }
108 |         } else {
109 |             const urlCandidates = [];
110 |             for (let i = 0; i < captionTracks.length; i++) {
111 |                 const track = captionTracks[i];
112 |                 if (language === track['languageCode']) {
113 |                     urlCandidates.push(`${track['baseUrl']}&fmt=json3`);
114 |                 }
115 |             }
116 |             for (let i = 0; i < urlCandidates.length; i++) {
117 |                 const urlCandidate = urlCandidates[i];
118 |                 if (preferAutoGenerated) {
119 |                     if (urlCandidate.includes('&kind=asr'))
120 |                         subtitlesToDl.push({lang: language, url: urlCandidate, type: SrtConvert.TYPE_AUTO_GENERATED});
121 |                 } else {
122 |                     if (!urlCandidate.includes('&kind=asr'))
123 |                         subtitlesToDl.push({lang: language, url: urlCandidate, type: SrtConvert.TYPE_USER_GENERATED});
124 |                 }
125 |             }
126 |             if (subtitlesToDl.length === 0 && urlCandidates.length > 0) // fall back to the first track if the preferred kind is missing
127 |                 subtitlesToDl = [{lang: language, url: urlCandidates[0], type: urlCandidates[0].includes('&kind=asr') ? SrtConvert.TYPE_AUTO_GENERATED : SrtConvert.TYPE_USER_GENERATED}];
128 |         }
129 |
130 |         const fetchingUrls = [];
131 |         const fetchingJsons = [];
132 |         for (let i = 0; i < subtitlesToDl.length; i++) {
133 |             const std = subtitlesToDl[i];
134 |             const pFetch = fetch(std.url, {method: 'GET'});
135 |             fetchingUrls.push(pFetch);
136 |             pFetch.then(response => {
137 |                 const pJson = response.json();
138 |                 fetchingJsons.push(pJson);
139 |                 pJson.then(json => {
140 |                     log.debug(
141 |                         `Subtitle type for ${page.url()} lang:${std.lang}, type:${std.type}` +
142 |                         ` fetched, converting to SRT...`
143 |                     );
144 |                     const conv = new SrtConvert(json, std.lang, std.type);
145 |                     conv.convert();
146 |                     converters.push(conv);
147 |                 }).catch(reason => {
148 |                     log.warning(
149 |                         `Unable to convert subtitles for ${page.url()}, ` +
150 |                         `language:${std.lang}\nReason:${reason.toString()}`
151 |                     );
152 |                 });
153 |             }).catch(reason => {
154 |                 log.warning(
155 |                     `Unable to fetch subtitles for ${page.url()}, ` +
156 |                     `language:${std.lang}\nReason:${reason.toString()}`
157 |                 );
158 |             });
159 |         }
160 |         await Promise.all(fetchingUrls);
161 |         await Promise.all(fetchingJsons);
162 |     } catch (e) {
163 |         log.warning(`No subtitles found for ${page.url()}.`);
164 |     }
165 |
166 |     return converters;
167 | }
168 |
169 | async function processFetchedSubtitles(page, videoId, converters, subtitlesSettings) {
170 |     let subtitles = null;
171 |     if (converters) {
172 |         subtitles = [];
173 |         for (let i = 0; i < converters.length; i++) {
174 |             const c = converters[i];
175 |             let srtUrl = null;
176 |             if (subtitlesSettings.saveToKVS) {
177 |                 const id = `subtitles_${videoId}_${c.language}_${c.type}`;
178 |                 log.debug(
179 |                     `Saving subtitles for ${page.url()}, lang:${c.language}, ` +
180 |                     `type:${c.type} to KeyValueStore, id=${id}`
181 |                 );
182 |                 await subtitlesSettings.kvs.setValue(id, {
183 |                     subtitles: c.srt,
184 |                     type: c.type,
185 |                     language: c.language,
186 |                 });
187 |                 srtUrl = subtitlesSettings.kvs.getPublicUrl(id);
188 |             }
189 |             subtitles.push({
190 |                 srt: c.srt,
191 |                 srtUrl: srtUrl,
192 |                 type: c.type,
193 |                 language: c.language,
194 |             });
195 |         }
196 |     }
197 |     return subtitles;
198 | }
199 |
200 | exports.fetchSubtitles = fetchSubtitles;
201 | exports.processFetchedSubtitles = processFetchedSubtitles;
202 |
--------------------------------------------------------------------------------
/test/utility_spec.js:
--------------------------------------------------------------------------------
1 | const { describe, it } = require('mocha');
2 | const { assert, expect } = require('chai');
3 | const should = require('chai').should();
4 | const moment = require('moment');
5 |
6 | const utils =
require('../src/utility'); 7 | 8 | describe('getRandBetween', () => { 9 | const numTestCycles = 1000; 10 | const includeList = [3, 4, 5, 6, 7]; 11 | 12 | it('should return a random number between the given inputs', () => { 13 | // loop to generate random tests 14 | for (let i = 0; i < numTestCycles; i++) { 15 | const result = utils.getRandBetween(3, 7); 16 | assert(includeList.indexOf(result) >= 0, `random value [${result}] is within expected range`); 17 | } 18 | }); 19 | }); 20 | 21 | describe('getRandClickPos', () => { 22 | // start with output of `JSON.stringify(document.createElement('div').getBoundingClientRect())` 23 | // remove right,left,top,bottom to make it compatible with puppeteer boundingBox 24 | const divRect = JSON.parse('{"x":0,"y":0,"width":0,"height":0}'); 25 | divRect.x = 10; 26 | divRect.y = 10; 27 | divRect.width = 50; 28 | divRect.height = 10; 29 | 30 | it('should take a valid puppeteer boundingBox', () => { 31 | divRect.should.have.property('x'); 32 | divRect.should.have.property('y'); 33 | divRect.should.have.property('width'); 34 | divRect.should.have.property('height'); 35 | assert(divRect.width > 0, 'width is greater than zero'); 36 | assert(divRect.height > 0, 'height is greater than zero'); 37 | }); 38 | 39 | it('should select a random point well within the boundaries of a clickable element', () => { 40 | const numTestCycles = 100; 41 | 42 | for (let i = 0; i < numTestCycles; i++) { 43 | const clickPos = utils.getRandClickPos(divRect); 44 | clickPos.should.have.property('xPos'); 45 | clickPos.should.have.property('yPos'); 46 | 47 | const { xPos, yPos } = clickPos; 48 | assert(xPos > divRect.x, 'random x is greater than minimum x'); 49 | assert(xPos < (divRect.x + divRect.width), 'random x is less than maximum x'); 50 | assert(yPos > divRect.y, 'random y is greater than minimum y'); 51 | assert(yPos < (divRect.y + divRect.height), 'random y is less than maximum y'); 52 | } 53 | }); 54 | }); 55 | 56 | describe('categorizeUrl', () => { 57 | it('should categorize different start urls', () => { 58 | expect(utils.categorizeUrl('')).to.equal('MASTER'); 59 | expect(utils.categorizeUrl('/watch?v=394u19u')).to.equal('DETAIL'); 60 | expect(utils.categorizeUrl('https://youtube.com/watch?v=394u19u')).to.equal('DETAIL'); 61 | expect(utils.categorizeUrl('/channel/asdrtsert/videos')).to.equal('CHANNEL'); 62 | expect(utils.categorizeUrl('https://www.youtube.com/user/asdrtsert/videos')).to.equal('CHANNEL'); 63 | expect(utils.categorizeUrl('https://www.youtube.com/c/asdrtsert')).to.equal('CHANNEL'); 64 | expect(utils.categorizeUrl('https://www.youtube.com/results?search_query=hello')).to.equal('SEARCH'); 65 | }); 66 | }); 67 | 68 | describe('getCutoffDate', () => { 69 | it('should return the correct duration for given date string', () => { 70 | const timeNow = moment(); 71 | const numTestCycles = 100; 72 | const durationTypes = ['hours', 'days', 'weeks', 'months', 'years']; 73 | 74 | // loop to generate random tests 75 | let timeThen = null; 76 | let duration = null; 77 | for (let i = 0; i < numTestCycles; i++) { 78 | const selectedIndex = utils.getRandBetween(0, durationTypes.length - 1); 79 | const durType = durationTypes[selectedIndex]; 80 | const count = utils.getRandBetween(1, 9); 81 | 82 | const randInputString = `${count} ${durType} ago`; 83 | 84 | timeThen = utils.getCutoffDate(randInputString); 85 | duration = moment.duration(timeNow.diff(timeThen)); 86 | const newDur = Math.round(duration.as(durType)); 87 | 88 | assert(newDur === count, `getCutoffDate correctly 
extracted '${newDur}' from '${randInputString}'`);
89 |         }
90 |     });
91 | });
92 |
93 | describe('isDateInputValid', () => {
94 |     const isValid = utils.isDateInputValid;
95 |
96 |     it('should validate date input as entered by user', () => {
97 |         assert(isValid('1 week ago') === true, '1 week ago is valid');
98 |         assert(isValid('1 day ago') === true, '1 day ago is valid');
99 |         assert(isValid('1 hour ago') === true, '1 hour ago is valid');
100 |         assert(isValid('3 weeks ago') === true, '3 weeks ago is valid');
101 |         assert(isValid('2 hours ago') === true, '2 hours ago is valid');
102 |         assert(isValid('13 weeks ago') === true, '13 weeks ago is valid');
103 |         assert(isValid('60 weeks ago') === true, '60 weeks ago is valid');
104 |         assert(isValid('36 hours ago') === true, '36 hours ago is valid');
105 |         assert(isValid('120 minutes ago') === true, '120 minutes ago is valid');
106 |         assert(isValid('3 minutes ago') === true, '3 minutes ago is valid');
107 |         assert(isValid('9 days ago') === true, '9 days ago is valid');
108 |
109 |         assert(isValid('0 days ago') === false, '0 days ago is invalid');
110 |         assert(isValid('400 days ago') === true, '400 days ago is valid');
111 |         assert(isValid('1 week agos') === false, '1 week agos is invalid');
112 |         assert(isValid('n days ago') === false, 'n days ago is invalid');
113 |         assert(isValid('3 decades ago') === false, '3 decades ago is invalid');
114 |         assert(isValid('minutes ago') === false, 'minutes ago is invalid');
115 |         assert(isValid('hours') === false, 'hours is invalid');
116 |         assert(isValid('ago') === false, 'ago is invalid');
117 |         assert(isValid('60') === false, '60 is invalid');
118 |         assert(isValid('36 # ago') === false, '36 # ago is invalid');
119 |         assert(isValid('120 minutes ago ##') === false, '120 minutes ago ## is invalid');
120 |     });
121 | });
122 |
123 | describe('getYoutubeDateFilters', () => {
124 |     const filter = utils.getYoutubeDateFilters;
125 |
126 |     it('should return the youtube filter corresponding to the user\'s requested date filter', () => {
127 |         expect(filter('1 week ago')).to.be.an('array').that.has.members(['Upload date', 'This week']);
128 |         expect(filter('1 day ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
129 |         expect(filter('1 hour ago')).to.be.an('array').that.has.members(['Upload date', 'Last hour']);
130 |         expect(filter('3 weeks ago')).to.be.an('array').that.has.members(['Upload date', 'This month']);
131 |         expect(filter('2 hours ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
132 |         expect(filter('13 weeks ago')).to.be.an('array').that.has.members(['Upload date', 'This year']);
133 |         // eslint-disable-next-line no-unused-expressions
134 |         expect(filter('60 weeks ago')).to.be.an('array').that.is.empty;
135 |         expect(filter('36 hours ago')).to.be.an('array').that.has.members(['Upload date', 'This week']);
136 |         expect(filter('120 minutes ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
137 |         expect(filter('3 minutes ago')).to.be.an('array').that.has.members(['Upload date', 'Last hour']);
138 |         expect(filter('9 days ago')).to.be.an('array').that.has.members(['Upload date', 'This month']);
139 |         // eslint-disable-next-line no-unused-expressions
140 |         expect(filter('400 days ago')).to.be.an('array').that.is.empty;
141 |     });
142 | });
143 |
144 | describe('getVideoId', () => {
145 |     const testId = 'jL_nMu9HhfA';
146 |
147 |     const testUrlList = [];
148 |     testUrlList.push(`http://www.youtube.com/sandalsResorts#p/c/54B8C800269D7C1B/0/${testId}`);
149 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
150 |     testUrlList.push(`http://youtu.be/${testId}`);
151 |     testUrlList.push(`http://www.youtube.com/embed/${testId}`);
152 |     testUrlList.push(`https://www.youtube.com/embed/${testId}`);
153 |     testUrlList.push(`http://www.youtube.com/v/${testId}?fs=1&hl=en_US`);
154 |     testUrlList.push(`http://www.youtube.com/watch?v=${testId}`);
155 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
156 |     testUrlList.push(`http://www.youtube.com/ytscreeningroom?v=${testId}`);
157 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
158 |     testUrlList.push(`http://www.youtube.com/watch?v=${testId}&feature=featured`);
159 |
160 |     it('should extract the video id from various youtube url formats', () => {
161 |         for (const testUrl of testUrlList) {
162 |             const videoId = utils.getVideoId(testUrl);
163 |             assert(videoId.indexOf(testId) >= 0, `${testId} extracted from ${testUrl}`);
164 |         }
165 |     });
166 | });
167 |
168 | describe('getMaxVideos', () => {
169 |     it('should return the correct number of max videos to use', () => {
170 |         assert(utils.getMaxVideos(20, 30) === 20, 'numOfVideos is 20 and userMaximum is 30');
171 |         assert(utils.getMaxVideos(30, 20) === 20, 'numOfVideos is 30 and userMaximum is 20');
172 |         assert(utils.getMaxVideos(0, 30) === 0, 'numOfVideos is 0 and userMaximum is 30');
173 |         assert(utils.getMaxVideos(20, 0) === 20, 'numOfVideos is 20 and userMaximum is 0');
174 |         assert(utils.getMaxVideos(20, 20) === 20, 'numOfVideos is 20 and userMaximum is 20');
175 |     });
176 | });
177 |
178 | describe('unformatNumbers', () => {
179 |     it('should transform formatted numbers like 1.2K into 1200', () => {
180 |         assert(utils.unformatNumbers('1.23M') === 1230000, '1.23M is converted to 1230000');
181 |         assert(utils.unformatNumbers('6.0K') === 6000, '6.0K is converted to 6000');
182 |         assert(utils.unformatNumbers('2B') === 2000000000, '2B is converted to 2000000000');
183 |         assert(utils.unformatNumbers('0K') === 0, '0K is converted to 0');
184 |         assert(utils.unformatNumbers('1K') === 1000, '1K is converted to 1000');
185 |         assert(utils.unformatNumbers('0.24K') === 240, '0.24K is converted to 240');
186 |     });
187 | });
188 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Using the **YouTube scraper**, you can extract data from keyword search results, scrape detailed video data such as the like/dislike ratio, collect channel details, download captions, and scrape comment sections.
2 |
3 | Unlike the [official YouTube API](https://developers.google.com/youtube/v3), this YouTube scraper lets you scrape results without quota limits or a login requirement.
4 |
5 | Our YouTube API is open-source and you can easily run it locally or on your own system. Contributions are welcome.
6 |
7 | ## Features
8 |
9 | - Scrape videos by specifying multiple search keywords or URLs to get [video details](https://apify.com/bernardo/youtube-scraper#scraper-output), including e.g. the like/dislike ratio (see the sketch after this list).
10 | - Scrape channel details (username, description, number of subscribers, etc.)
11 | - **[NEW]** Scrape and download YouTube subtitles and captions (both auto- and user-generated) in any language from any country.
12 | - **[NEW]** Scrape the YouTube comment section (no nested comments at the moment though).
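
For example, the like/dislike ratio mentioned above can be derived directly from the scraped fields (a minimal sketch; the `likes` and `dislikes` field names follow the example output shown later in this README):

```js
// Hypothetical helper: like/dislike ratio of one scraped dataset item.
const likeDislikeRatio = ({ likes, dislikes }) => (
    likes + dislikes > 0 ? likes / (likes + dislikes) : null
);
```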
13 |
14 | ## Tutorial
15 |
16 | For a more detailed explanation of [how to scrape YouTube](https://blog.apify.com/how-to-scrape-youtube), read the step-by-step tutorial on our [blog](https://blog.apify.com/).
17 |
18 | And for more ideas on how to use the extracted data, check out our [industries pages](https://apify.com/industries) for concrete ways web scraping results are already being used across projects and businesses of various scales - in [media and marketing](https://apify.com/industries/marketing-and-media), for instance.
19 |
20 | ## Cost of usage
21 |
22 | On average, scraping **1000 items** from YouTube via the Apify platform will cost you around **2.5 USD** in platform credits off your subscription plan. For more details about the plans we offer, platform credits and usage, see the [platform pricing page](https://apify.com/pricing/actors).
23 |
24 | If you're not sure how many credits you've got left on your plan and whether you might need to upgrade, you can always check your limits in the *Settings* -> *Usage and Billing* tab in [your Console](https://console.apify.com/).
25 | The easiest way to find out how many credits your actor will need is to perform a test run.
26 |
27 | ### Proxy usage
28 | This actor, like most [social media-related scrapers](https://apify.com/store?category=SOCIAL_MEDIA), requires **proxy servers** to run properly. You can use either your own proxy servers or [Apify Proxy](https://www.apify.com/docs/proxy). We recommend using [datacenter proxies](https://help.apify.com/en/articles/5265932-what-is-a-proxy) to achieve the best scraping potential of this actor.
29 |
30 | ## Input parameters
31 |
32 | If this actor is run on our [platform](https://console.apify.com/), a user-friendly UI will help you configure all the necessary and optional parameters of this scraper before running it. Our YouTube actor recognizes the following input fields:
33 |
34 | - **searchKeywords** - Your YouTube search query, say *Nimbus 2000 reviews*; this can be used instead of a URL.
35 | - **startUrls** - A more precise alternative to **searchKeywords**. You can provide specific YouTube search, channel or video URLs.
36 | - **maxResults** - Sets how many videos should be scraped from each search or channel. Defaults to 50, but you can leave it empty for unlimited search.
37 | - **maxComments** - Limits the number of comments that you want to scrape. 0 or empty means no comments will be scraped.
38 |
39 | - **downloadSubtitles** - Scrape both user-generated and auto-generated captions and convert them to SRT format. Boolean value, defaults to false.
40 | - **subtitlesLanguage** - Download only subtitles of the selected language (possible values `"en"`, `"de"`, `"es"`...)
41 | - **preferAutoGeneratedSubtitles** - Prefer the auto-generated speech-to-text subtitles to the user-made ones.
42 | - **saveSubsToKVS** - Saves the scraped subtitles in the *Apify Key Value Store* (see the sketch after this list).
43 | - **proxyConfiguration** *(required)* - Configures proxy settings.
44 | - **verboseLog** - Turns on verbose logging for accurate monitoring and more details about the runs.
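
To illustrate **saveSubsToKVS**: after a run, the stored subtitles can be read back from the default Key Value Store. Here is a minimal sketch (the key pattern and record shape follow `src/subtitles.js`; the video ID and language are placeholders):

```js
const Apify = require('apify');

Apify.main(async () => {
    const store = await Apify.openKeyValueStore();
    // Keys are saved as subtitles_<videoId>_<languageCode>_<type>,
    // where <type> is 'auto_generated' or 'user_generated'.
    const record = await store.getValue('subtitles_oxy8udgWRmo_en_user_generated');
    if (record) {
        console.log(record.language, record.type);
        console.log(record.subtitles); // the converted .srt text
    }
});
```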
45 |
46 | *See more technical details of the input parameters in the [Input Schema tab](https://apify.com/bernardo/youtube-scraper/input-schema#searchKeywords) of this actor.*
47 |
48 | ### Example
49 |
50 | ```json
51 | {
52 |     "searchKeywords": "Terminator dark fate",
53 |     "maxResults": 30,
54 |     "startUrls": [{
55 |         "url": "https://www.youtube.com/channel/UC8w/videos" // channel videos
56 |     }, {
57 |         "url": "https://www.youtube.com/results?search_query=finances" // search queries
58 |     }, {
59 |         "url": "https://www.youtube.com/watch?v=kJQP7kiw5Fk" // videos
60 |     }],
61 |     "proxyConfiguration": {
62 |         "useApifyProxy": true
63 |     },
64 |     "verboseLog": false
65 | }
66 |
67 | ```
68 |
69 |
70 | ## YouTube Scraper output
71 |
72 | After the actor finishes the run, it will store the scraped results in the *Dataset*. Each YouTube video becomes a separate record in the dataset (see a JSON example below). Using the Apify platform, you can choose to present and download the contents of the dataset in different data formats (JSON, RSS, XML, HTML Table...).
73 |
74 | ### Example
75 | ```json
76 | {
77 |     "title": "Terminator: Dark Fate - Official Trailer (2019) - Paramount Pictures",
78 |     "id": "oxy8udgWRmo",
79 |     "url": "https://www.youtube.com/watch?v=oxy8udgWRmo",
80 |     "viewCount": 15432,
81 |     "date": "2019-08-29T00:00:00+00:00",
82 |     "likes": 121000,
83 |     "dislikes": 23000,
84 |     "channelName": "Paramount Pictures",
85 |     "channelUrl": "https://www.youtube.com/channel/UCF9imwPMSGz4Vq1NiTWCC7g",
86 |     "numberOfSubscribers": 1660000,
87 |     "details": "Welcome to the day after..."
88 | }
89 | ```
90 |
91 | ### Extend output function
92 |
93 | The extend output function lets you add or remove properties on the output items, or omit an item from the dataset entirely by returning `null`:
94 |
95 | ```js
96 | async ({ item }) => {
123 |     // remove information from the item
124 |     item.details = undefined;
125 |     // or delete item.details;
126 |     return item;
127 | }
128 | ```
129 |
130 | ```js
131 | async ({ item, page }) => {
132 |     // add more info, in this case, the shortLink for the video
133 |     const shortLink = await page.evaluate(() => {
134 |         const link = document.querySelector('link[rel="shortlinkUrl"]');
135 |         if (link) {
136 |             return link.href;
137 |         }
138 |     });
139 |
140 |     return {
141 |         ...item,
142 |         shortLink,
143 |     }
144 | }
145 | ```
146 |
147 | ```js
148 | async ({ item }) => {
149 |     // omit item, just return null
150 |     return null;
151 | }
152 | ```
153 |
154 | ### Extend scraper function
155 |
156 | The extend scraper function allows you to add functionality to the existing baseline behavior. For example, you may enqueue related videos, but not recursively:
157 |
158 | ```js
159 | async ({ page, request, requestQueue, customData, Apify }) => {
160 |     if (request.userData.label === 'DETAIL' && !request.userData.isRelated) {
161 |         await page.waitForSelector('ytd-watch-next-secondary-results-renderer');
162 |
163 |         const related = await page.evaluate(() => {
164 |             return [...document.querySelectorAll('ytd-watch-next-secondary-results-renderer a[href*="watch?v="]')].map(a => a.href);
165 |         });
166 |
167 |         for (const url of related) {
168 |             await requestQueue.addRequest({
169 |                 url,
170 |                 userData: {
171 |                     label: 'DETAIL',
172 |                     isRelated: true,
173 |                 },
174 |             });
175 |         }
176 |     }
177 | }
178 | ```
179 |
180 | *NB: If this specific function throws an exception, the same URL will be retried.*
181 |
182 | ## Acknowledgments and personal data
183 |
184 | This scraper handles cookies and privacy consent dialogs on your behalf. Therefore, you should be aware that the results from your YouTube scraping might contain personal data.
185 | 186 | Personal data is protected by GDPR ([EU Regulation 2016/679](https://eur-lex.europa.eu/eli/reg/2016/679/oj)), and by other regulations around the world. You should **not** scrape personal data unless you have a legitimate reason to do so. 187 | 188 | If you're unsure whether your reason is legitimate, consult your lawyers. You can also read our blog post on the [legality of web scraping](https://blog.apify.com/is-web-scraping-legal/). 189 | 190 | ## Other video and social media scrapers 191 | 192 | We have other video-related scrapers in stock for you; to see more of those, check out the [Video Category in Apify Store](https://apify.com/store?category=VIDEOS) or the compilation of [Social Media Scrapers](https://apify.com/store?category=SOCIAL_MEDIA). 193 | 194 | ## Your feedback 195 | 196 | We’re always working on improving the performance of our actors. So if you’ve got any technical feedback about the work of our YouTube API, or simply **found a bug,** please create an issue on the [Github page](https://github.com/bernardro/actor-youtube-scraper) and we’ll get to it. 197 | 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Apify Technologies s.r.o. 190 | 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/src/crawler_utils.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable max-len */
2 | const moment = require('moment');
3 | const Apify = require('apify');
4 | // eslint-disable-next-line no-unused-vars
5 | const Puppeteer = require('puppeteer');
6 | 
7 | const { log, sleep } = Apify.utils;
8 | 
9 | const utils = require('./utility');
10 | const CONSTS = require('./consts');
11 | const { handleErrorAndScreenshot, unformatNumbers } = require('./utility');
12 | const { fetchSubtitles, processFetchedSubtitles } = require('./subtitles');
13 | 
14 | /**
15 |  * @param {{
16 |  *  page: Puppeteer.Page,
17 |  *  requestQueue: Apify.RequestQueue,
18 |  *  searchKeywords: string[],
19 |  *  maxResults: number,
20 |  *  request: Apify.Request,
21 |  *  simplifiedInformation: boolean,
22 |  *  input: object,
23 |  * }} config
24 |  */
25 | exports.handleMaster = async ({ page, requestQueue, searchKeywords, maxResults, request, simplifiedInformation, input }) => {
26 |     const { searchBox, toggleFilterMenu, filterBtnsXp } = CONSTS.SELECTORS.SEARCH;
27 |     const { search, label } = request.userData;
28 | 
29 |     // Search only if a search term was provided directly on input; for other Start URLs we go straight to scrolling
30 |     if (search && label === 'MASTER') {
31 |         // we are searching
32 |         log.debug('waiting for input box...');
33 |         const searchBxElem = await page.waitForSelector(searchBox, { visible: true });
34 |         if (searchBxElem) {
35 |             log.debug(`[${search}]: searchBoxInput found at ${searchBox}`);
36 |         }
37 | 
38 |         log.info(`[${search}]: Entering search text...`);
39 |         await utils.doTextInput(page, search);
40 | 
41 |         // submit search and wait for results page (and filter button) to load
42 |         log.info(`[${search}]: Submitting search...`);
43 | 
44 |         await Promise.allSettled([
45 |             page.tap('#search-icon-legacy'),
46 |             page.waitForNavigation({ timeout: 15000 }),
47 |         ]);
48 | 
49 |         // pause while page reloads
50 |         await sleep(utils.getDelayMs(CONSTS.DELAY.HUMAN_PAUSE));
51 |     }
52 | 
53 |     const searchOrUrl = search || request.url;
54 | 
55 |     log.debug(`[${searchOrUrl}]: waiting for first video to load...`);
56 |     const { youtubeVideosSection, youtubeVideosRenderer } = CONSTS.SELECTORS.SEARCH;
57 |     // static wait to ensure the page is loaded; networkidle2 is sometimes unreliable here
58 |     await page.waitForTimeout(CONSTS.DELAY.START_LOADING_MORE_VIDEOS);
59 |     const queuedVideos = await page.$$(`${youtubeVideosSection} ${youtubeVideosRenderer}`);
60 | 
61 |     // prepare to infinite scroll manually
62 |     // puppeteer.infiniteScroll(page) is currently buggy
63 |     // see https://github.com/apifytech/apify-js/issues/503
64 |     await utils.moveMouseToCenterScreen(page, CONSTS.MOUSE_STEPS);
65 | 
66 |     // keep scrolling until there are no more videos or the max limit is reached
67 |     if (queuedVideos.length === 0) {
68 |         if (searchKeywords) {
69 |             throw `[${searchOrUrl}]: Error: The keywords '${searchKeywords}' returned no YouTube videos, retrying...`;
70 |         }
71 |         throw `[${searchOrUrl}]: Error: No videos found`;
72 |     }
73 | 
74 |     log.info(`[${searchOrUrl}]: Starting infinite scrolling downwards to load all the videos...`);
75 | 
76 |     const maxRequested = (maxResults && maxResults > 0) ? +maxResults : 99999;
77 | 
78 |     const basicInfoParams = {
79 |         page,
80 |         maxRequested,
81 |         isSearchResultPage: ['SEARCH'].includes(label),
82 |         input,
83 |         requestUrl: request.url,
84 |     };
85 | 
86 |     const loadVideosUrlsParams = {
87 |         requestQueue,
88 |         page,
89 |         maxRequested,
90 |         isSearchResultPage: ['MASTER', 'SEARCH'].includes(label),
91 |         searchOrUrl,
92 |     };
93 | 
94 |     if (!simplifiedInformation) {
95 |         await utils.loadVideosUrls(loadVideosUrlsParams);
96 |     } else {
97 |         await getBasicInformation(basicInfoParams);
98 |     }
99 | };
100 | 
101 | exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSettings, maxComments) => {
102 |     const { titleXp, viewCountXp, uploadDateXp, likesXp, dislikesXp,
103 |         channelXp, subscribersXp, descriptionXp, durationSlctr, commentsSlctr } = CONSTS.SELECTORS.VIDEO;
104 | 
105 |     log.info(`handling detail url ${request.url}`);
106 |     // Need to scroll twice to load comments. One scroll works locally, but as of 17.05.2022 two scrolls are needed on the platform.
107 |     await page.evaluate(() => {
108 |         window.scrollBy(window.innerWidth, window.innerHeight);
109 |     });
110 | 
111 |     await sleep(CONSTS.DELAY.START_LOADING_MORE_VIDEOS);
112 | 
113 |     await page.evaluate(() => {
114 |         window.scrollBy(window.innerWidth, window.innerHeight);
115 |     });
116 | 
117 |     const videoId = utils.getVideoId(request.url);
118 |     log.debug(`got videoId as ${videoId}`);
119 | 
120 |     // TODO: These getDataFromXpath calls are fragile design, as any missing selector will crash the whole page handler.
121 |     // They should instead use jQuery or be wrapped in try/catch.
122 |     log.debug(`searching for title at ${titleXp}`);
123 |     const title = await utils.getDataFromXpath(page, titleXp, 'innerHTML')
124 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-title-failed'));
125 |     log.debug(`got title as ${title}`);
126 | 
127 |     log.debug(`searching for viewCount at ${viewCountXp}`);
128 |     const viewCountStr = await utils.getDataFromXpath(page, viewCountXp, 'innerHTML')
129 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-viewCount-failed'));
130 |     const viewCount = utils.unformatNumbers(viewCountStr);
131 |     log.debug(`got viewCount as ${viewCountStr} -> ${viewCount}`);
132 | 
133 |     log.debug(`searching for uploadDate at ${uploadDateXp}`);
134 |     const uploadDateStr = await utils.getDataFromXpath(page, uploadDateXp, 'innerHTML')
135 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-uploadDate-failed'));
136 |     const uploadDateCleaned = uploadDateStr.replace('Premiered', '').trim();
137 |     const uploadDate = moment(uploadDateCleaned, 'MMM DD, YYYY').format();
138 |     log.debug(`got uploadDate as ${uploadDate}, uploadDateStr: ${uploadDateStr}, uploadDateCleaned: ${uploadDateCleaned}`);
139 | 
140 | 
141 |     // YT returns 3 different types of "like" button. Couldn't find any generic selector. Getting info from