├── .gitignore
├── apify.json
├── test
│   ├── helpers
│   │   └── chai.js
│   └── utility_spec.js
├── .editorconfig
├── .eslintrc
├── jsconfig.json
├── package.json
├── Dockerfile
├── CHANGELOG.md
├── src
│   ├── consts.js
│   ├── main.js
│   ├── subtitles.js
│   ├── crawler_utils.js
│   └── utility.js
├── .actor
│   └── actor.json
├── INPUT_SCHEMA.json
├── README.md
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | apify_storage
3 | node_modules
4 | logs
5 | *.log
6 | .DS_Store
7 | package-lock.json
8 |
--------------------------------------------------------------------------------
/apify.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "youtube-search-scraper",
3 |     "version": "0.0",
4 |     "buildTag": "latest",
5 |     "env": {
6 |     },
7 |     "template": "puppeteer_crawler"
8 | }
9 |
--------------------------------------------------------------------------------
/test/helpers/chai.js:
--------------------------------------------------------------------------------
1 | const chai = require('chai');
2 |
3 | chai.config.includeStack = true;
4 |
5 | global.expect = chai.expect;
6 | global.AssertionError = chai.AssertionError;
7 | global.Assertion = chai.Assertion;
8 | global.assert = chai.assert;
9 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | charset = utf-8
7 | trim_trailing_whitespace = true
8 | insert_final_newline = true
9 | end_of_line = lf
10 | # editorconfig-tools is unable to ignore long strings or urls
11 | max_line_length = null
12 |
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 |     "extends": "@apify",
3 |     "env": {
4 |         "mocha": true
5 |     },
6 |     "parserOptions": {
7 |         "ecmaVersion": 2020
8 |     },
9 |     "rules": {
10 |         "no-throw-literal": "off",
11 |         "no-unused-vars": [
12 |             "error",
13 |             {
14 |                 "varsIgnorePattern": "should|expect"
15 |             }
16 |         ]
17 |     }
18 | }
19 |
--------------------------------------------------------------------------------
/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |     "compilerOptions": {
3 |         "target": "es2018",
4 |         "module": "commonJS",
5 |         "lib": [
6 |             "dom",
7 |             "dom.iterable",
8 |             "es5",
9 |             "es6",
10 |             "es2018",
11 |             "es2019.array",
12 |             "es2019.object",
13 |             "es2020.string"
14 |         ],
15 |         "strict": true,
16 |         "checkJs": true,
17 |         "alwaysStrict": true,
18 |         "moduleResolution": "node",
19 |         "esModuleInterop": true,
20 |         "noImplicitAny": true,
21 |         "noImplicitReturns": false,
22 |         "allowSyntheticDefaultImports": false
23 |     },
24 |     "include": [
25 |         "./src/*.js"
26 |     ]
27 | }
28 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "actor-youtube-scraper",
3 |     "version": "0.0.1",
4 |     "description": "Actor to scrape youtube videos",
5 |     "keywords": [
6 |         "apify",
7 |         "apifier",
8 |         "crawler",
9 |         "search",
10 |         "youtube",
11 |         "puppeteer"
12 |     ],
13 |     "dependencies": {
14 |         "apify": "^2.3.0",
15 |         "moment": "^2.29.1",
16 |         "node-fetch": "^2.6.1",
17 |         "puppeteer": "*"
18 |     },
19 |     "devDependencies": {
20 |         "@apify/eslint-config": "^0.1.4",
"@types/puppeteer": "^5.4.2", 22 | "chai": "^4.2.0", 23 | "eslint": "7.17.0", 24 | "mocha": "^8.2.1", 25 | "sinon": "^9.2.3" 26 | }, 27 | "scripts": { 28 | "start": "node src/main.js", 29 | "test": "mocha test/utility_spec.js", 30 | "apify": "apify run" 31 | }, 32 | "author": "bernardo@sonkomail.com", 33 | "contributors": [ 34 | "Bernard Okoth " 35 | ], 36 | "license": "Apache-2.0", 37 | "homepage": "https://github.com/bernardro/actor-youtube-scraper", 38 | "repository": { 39 | "type": "git", 40 | "url": "git+https://github.com/bernardro/actor-youtube-scraper" 41 | }, 42 | "bugs": { 43 | "url": "https://github.com/bernardro/actor-youtube-scraper/issues" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # First, specify the base Docker image. You can read more about 2 | # the available images at https://sdk.apify.com/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-puppeteer-chrome:16 5 | 6 | # Second, copy just package.json and package-lock.json since it should be 7 | # the only file that affects "npm install" in the next step, to speed up the build 8 | COPY package*.json ./ 9 | 10 | # Install NPM packages, skip optional and development dependencies to 11 | # keep the image small. Avoid logging too much and print the dependency 12 | # tree for debugging 13 | RUN npm --quiet set progress=false \ 14 | && npm install --only=prod --no-optional \ 15 | && echo "Installed NPM packages:" \ 16 | && (npm list --only=prod --no-optional --all || true) \ 17 | && echo "Node.js version:" \ 18 | && node --version \ 19 | && echo "NPM version:" \ 20 | && npm --version 21 | 22 | # Next, copy the remaining files and directories with the source code. 23 | # Since we do this after NPM install, quick build will be really fast 24 | # for most source file changes. 25 | COPY . ./ 26 | 27 | # Optionally, specify how to launch the source code of your actor. 28 | # By default, Apify's base Docker images define the CMD instruction 29 | # that runs the Node.js source code using the command specified 30 | # in the "scripts.start" section of the package.json file. 31 | # In short, the instruction looks something like this: 32 | # 33 | # CMD npm start -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 2022-07-20 2 | *Fixes* 3 | - Correctly handle videos with comments turned off. 4 | - Add `commentsTurnedOff` to output. 5 | 6 | ## 2022-06-10 7 | *Fixes*: 8 | - Channel page without `/watch` selector 9 | 10 | ## 2021-09-15 11 | *Features* 12 | - Add possibility to scrape video comments. See `maxComments` input field. 13 | 14 | 2021-06-16 15 | *Features* 16 | - Revamped subtitles downloading - added possibility to download all available subtitles 17 | (availability defined by languages) and to prefer automatically generated subtitles before the user generated 18 | ones. 19 | 20 | 21 | 2021-06-14 22 | *Features*: 23 | - Add subtitle type to output (extendedOutputFunction). **Note**: You must set `downloadSubtitles` variable to `true` for this 24 | feature to take effect. 
25 |
26 | ## 2021-06-11
27 | *Features*
28 | - Subtitles are now downloadable (saved to KeyValueStore as `videoID_languageCode`)
29 |
30 | ## 2021-05-21
31 | *Features*
32 | - Update SDK
33 |
34 | *Fixes*
35 | - Random zero results when searching
36 | - Click consent dialog
37 |
38 | ## 2021-04-14
39 | *Fixes*
40 | - Fixed changed selector that completely prevented the scrape
41 |
42 | ## 2021-03-21
43 | *Features*
44 | - Updated SDK version for session pool changes
45 | - Add `handlePageTimeoutSecs` parameter to INPUT_SCHEMA
46 |
47 |
48 | ## 2021-03-15
49 | *Fixes*
50 | - Fixed selector causing no data scraped
51 | - Removed stealth causing issues with new layout
52 |
53 | ## 2020-09-27
54 | - Increased waiting timeouts to better handle concurrency
55 | - Added saving screenshots on errors
56 | - Better handling of captchas: the page is automatically retried and the browser is restarted with a new proxy
57 | - `verboseLog` is off by default
58 | - Added info about how many videos were enqueued, and overall better logging
59 |
--------------------------------------------------------------------------------
/src/consts.js:
--------------------------------------------------------------------------------
1 | exports.DELAY = {
2 |     KEY_PRESS: { MIN: 5, MAX: 25 },
3 |     BTWN_KEY_PRESS: { MIN: 45, MAX: 375 },
4 |     MOUSE_CLICK: { MIN: 40, MAX: 150 },
5 |     HUMAN_PAUSE: { MIN: 300, MAX: 800 },
6 |     START_LOADING_MORE_VIDEOS: 3000,
7 | };
8 |
9 | exports.MOUSE_STEPS = 5;
10 |
11 | // 'document', 'image', 'xhr', 'script', 'stylesheet', 'font', 'other', 'manifest'
12 | exports.MEDIA_TYPES = ['image'];
13 |
14 | exports.LABELS = {
15 |     DETAIL: 'DETAIL',
16 |     MASTER: 'MASTER',
17 |     CHANNEL: 'CHANNEL',
18 |     SEARCH: 'SEARCH',
19 | };
20 |
21 | exports.SELECTORS = {
22 |     SEARCH: {
23 |         searchBox: 'input#search',
24 |         toggleFilterMenu: '#button[aria-label="Search filters"]',
25 |         filterBtnsXp: '//ytd-search-filter-renderer/a/div/yt-formatted-string',
26 |         youtubeVideosSection: 'ytd-item-section-renderer',
27 |         youtubeVideosRenderer: 'ytd-video-renderer,ytd-grid-video-renderer', // grid is for channels
28 |         url: 'a[href^="/watch"]',
29 |         videoTitle: '#video-title',
30 |         channelNameText: '#channel-name #text-container', // multiple, get first
31 |         subscriberCount: '#subscriber-count',
32 |         canonicalUrl: 'link[rel="canonical"]',
33 |         simplifiedResultVideoTitle: '#video-title',
34 |         simplifiedResultDurationText: '#text',
35 |         simplifiedResultChannelName: '#channel-info > #channel-name',
36 |         simlifiedResultChannelUrl: '#channel-info > a',
37 |         simplifiedResultViewCount: '#metadata-line > span:nth-child(1)',
38 |         simplifiedResultDate: '#metadata-line > span:nth-child(2)',
39 |     },
40 |     VIDEO: {
41 |         titleXp: '//ytd-video-primary-info-renderer/div/h1/yt-formatted-string',
42 |         viewCountXp: '//*[@id="count"]/ytd-video-view-count-renderer/span[1]',
43 |         uploadDateXp: '//ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string',
44 |         likesXp: "//ytd-menu-renderer/div/ytd-toggle-button-renderer[1]/a/*[@id='text']",
45 |         dislikesXp: "//ytd-menu-renderer/div/ytd-toggle-button-renderer[2]/a/*[@id='text']",
46 |         channelXp: '//ytd-channel-name/div/div/yt-formatted-string/a',
47 |         subscribersXp: "//*[@id='owner-sub-count']",
48 |         descriptionXp: '//ytd-expander/div/div/yt-formatted-string',
49 |         durationSlctr: '#movie_player span.ytp-time-duration',
50 |         commentsSlctr: '.count-text',
51 |     },
52 | };
53 |
--------------------------------------------------------------------------------
/.actor/actor.json:
--------------------------------------------------------------------------------
1 | {
2 |     "actorSpecification": 1,
3 |     "name": "youtube-scraper",
4 |     "title": "Youtube Scraper",
5 |     "description": "Scrapes Youtube",
6 |     "version": "0.0.1",
7 |     "storages": {
8 |         "dataset": {
9 |             "actorSpecification": 1,
10 |             "title": "Youtube Scraper",
11 |             "description": "To see all scraped properties, export the whole dataset or select All fields instead of Overview",
12 |             "views": {
13 |                 "overview": {
14 |                     "title": "Overview",
15 |                     "description": "",
16 |                     "transformation": {
17 |                         "fields": [
18 |                             "title",
19 |                             "id",
20 |                             "url",
21 |                             "viewCount",
22 |                             "date",
23 |                             "likes",
24 |                             "channelName",
25 |                             "channelUrl",
26 |                             "numberOfSubscribers",
27 |                             "duration"
28 |                         ]
29 |                     },
30 |                     "display": {
31 |                         "component": "table",
32 |                         "columns": [
33 |                             {
34 |                                 "label": "Title",
35 |                                 "format": "text",
36 |                                 "field": "title"
37 |                             },
38 |                             {
39 |                                 "label": "URL",
40 |                                 "format": "text",
41 |                                 "field": "url"
42 |                             },
43 |                             {
44 |                                 "label": "Number of views",
45 |                                 "format": "text",
46 |                                 "field": "viewCount"
47 |                             },
48 |                             {
49 |                                 "label": "Date",
50 |                                 "format": "text",
51 |                                 "field": "date"
52 |                             },
53 |                             {
54 |                                 "label": "Number of likes",
55 |                                 "format": "text",
56 |                                 "field": "likes"
57 |                             },
58 |                             {
59 |                                 "label": "Channel name",
60 |                                 "format": "text",
61 |                                 "field": "channelName"
62 |                             },
63 |                             {
64 |                                 "label": "Channel URL",
65 |                                 "format": "text",
66 |                                 "field": "channelUrl"
67 |                             },
68 |                             {
69 |                                 "label": "Number of subscribers",
70 |                                 "format": "text",
71 |                                 "field": "numberOfSubscribers"
72 |                             },
73 |                             {
74 |                                 "label": "Duration",
75 |                                 "format": "text",
76 |                                 "field": "duration"
77 |                             }
78 |                         ]
79 |                     }
80 |                 }
81 |             }
82 |         }
83 |     }
84 | }
85 |
--------------------------------------------------------------------------------
/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
1 | {
2 |     "title": "Input schema for the youtube-search-scraper actor",
3 |     "type": "object",
4 |     "schemaVersion": 1,
5 |     "properties": {
6 |         "searchKeywords": {
7 |             "title": "Search keywords",
8 |             "type": "string",
9 |             "description": "What to search YouTube for",
10 |             "editor": "textfield",
11 |             "prefill": "Madonna",
12 |             "maxLength": 250
13 |         },
14 |         "maxResults": {
15 |             "title": "Maximum search results",
16 |             "type": "integer",
17 |             "description": "Limit the number of videos to crawl; leave the field empty for no limit",
18 |             "minimum": 1,
19 |             "maximum": 999999,
20 |             "prefill": 50,
21 |             "unit": "Videos",
22 |             "nullable": true
23 |         },
24 |         "startUrls": {
25 |             "title": "Direct URLs",
26 |             "type": "array",
27 |             "description": "Provide direct YouTube URLs, such as channel URLs or search URLs. When provided, this disables keyword search.",
28 |             "default": [],
29 |             "editor": "requestListSources",
30 |             "sectionCaption": "Direct URLs"
31 |         },
32 |         "simplifiedInformation": {
33 |             "title": "Get simplified information from a channel",
34 |             "type": "boolean",
35 |             "default": false,
36 |             "description": "If set to true, we only get basic video information from the channel. You have to provide a channel URL as a start URL"
37 |         },
38 |         "maxComments": {
39 |             "title": "Max comments",
40 |             "type": "integer",
41 |             "description": "Limit the number of comments that will get scraped. Scraping comments requires scrolling and takes time. 0 or empty means we will not scrape any comments at all.",
42 |             "default": 0,
43 |             "sectionCaption": "Comments & Subtitles",
44 |             "sectionDescription": "Use this if you want to download comments and/or video subtitles as well."
45 |         },
46 |         "downloadSubtitles": {
47 |             "title": "Download subtitles",
48 |             "type": "boolean",
49 |             "description": "If set to true, we will also download subtitles for the given video and convert them to SRT format"
50 |         },
51 |         "saveSubsToKVS": {
52 |             "title": "Save downloaded subtitles to Key Value Store",
53 |             "type": "boolean",
54 |             "description": "If set to true, we will save downloaded (and converted to .srt format) subtitles to the Key Value Store in the format { subtitles: srt_text }. 'downloadSubtitles' must be set to true for this to take effect"
55 |         },
56 |         "subtitlesLanguage": {
57 |             "title": "Download subtitles in selected language",
58 |             "type": "string",
59 |             "description": "Language to download subtitles in. 'downloadSubtitles' must be set to true for this to take effect",
60 |             "editor": "select",
61 |             "default": "en",
62 |             "enum": ["en", "de", "es", "fr", "it", "ja", "ko", "nl", "pt", "ru"],
63 |             "enumTitles": ["English", "German", "Spanish", "French", "Italian", "Japanese", "Korean", "Dutch", "Portuguese", "Russian"]
64 |         },
65 |         "preferAutoGeneratedSubtitles": {
66 |             "title": "Prefer automatically generated subtitles over the user-generated ones.",
67 |             "description": "If set to `true` and a language is provided, we download automatically generated subtitles rather than user-generated ones. If no language is provided, this setting has no effect",
68 |             "type": "boolean"
69 |         },
70 |         "extendOutputFunction": {
71 |             "title": "Extend Output Function",
72 |             "description": "Add or remove properties on the output object, or omit the output altogether by returning null",
73 |             "type": "string",
74 |             "default": "",
75 |             "prefill": "async ({ data, item, page, request, customData }) => {\n return item; \n}",
76 |             "editor": "javascript",
77 |             "sectionCaption": "Extend scraper functionality",
78 |             "sectionDescription": "You can change the output of the items for your dataset here, or add additional behavior to the scraper"
79 |         },
80 |         "extendScraperFunction": {
81 |             "title": "Extend Scraper Function",
82 |             "description": "Advanced function that allows you to extend the default scraper functionality, allowing you to manually perform actions on the page",
83 |             "type": "string",
84 |             "default": "",
85 |             "prefill": "async ({ page, request, requestQueue, customData, Apify, extendOutputFunction }) => {\n \n}",
86 |             "editor": "javascript"
87 |         },
88 |         "customData": {
89 |             "title": "Custom data",
90 |             "description": "Any data that you want to have available inside the Extend Output/Scraper Function",
91 |             "default": {},
92 |             "prefill": {},
93 |             "type": "object",
94 |             "editor": "json"
95 |         },
96 |         "handlePageTimeoutSecs": {
97 |             "title": "Handle page timeout",
98 |             "description": "Timeout for handling a single page, in seconds",
99 |             "default": 3600,
100 |             "prefill": 3600,
101 |             "type": "integer",
102 |             "editor": "number"
103 |         },
104 |         "proxyConfiguration": {
105 |             "title": "Proxy configuration",
106 |             "type": "object",
107 |             "editor": "proxy",
108 |             "description": "The best option is usually Automatic proxy, but you can also use your own proxies or no proxy",
109 |             "prefill": {
110 |                 "useApifyProxy": true
111 |             },
112 |             "default": {
113 |                 "useApifyProxy": true
114 |             },
115 |             "sectionCaption": "Proxy and browser configuration"
116 |         },
117 |         "verboseLog": {
118 |             "title": "Verbose log",
119 |             "type": "boolean",
120 |             "description": "If set to true, debug-level messages are logged, giving you more detail about the run"
121 |         }
122 |     },
123 |     "required": [
124 |         "proxyConfiguration"
125 |     ]
126 | }
127 |
--------------------------------------------------------------------------------
/src/main.js:
--------------------------------------------------------------------------------
1 | const Apify = require('apify');
2 |
3 | const utils = require('./utility');
4 | const crawler = require('./crawler_utils');
5 |
6 | const { log, puppeteer } = Apify.utils;
7 |
8 | Apify.main(async () => {
9 |     /**
10 |      * @type {any}
11 |      */
12 |     const input = await Apify.getInput();
13 |
14 |     const {
15 |         verboseLog,
16 |         startUrls = [],
17 |         proxyConfiguration,
18 |         searchKeywords,
19 |         maxResults,
20 |         simplifiedInformation = false,
21 |         // postsFromDate,
22 |         handlePageTimeoutSecs = 3600,
23 |         downloadSubtitles = false,
24 |         saveSubsToKVS: saveSubtitlesToKVS = false,
25 |         subtitlesLanguage = null,
26 |         preferAutoGeneratedSubtitles = false,
27 |         maxComments = 0,
28 |     } = input;
29 |     if (verboseLog) {
30 |         log.setLevel(log.LEVELS.DEBUG);
31 |     }
32 |     const kvStore = await Apify.openKeyValueStore();
33 |     const requestQueue = await Apify.openRequestQueue();
34 |     const proxyConfig = await utils.proxyConfiguration({
35 |         proxyConfig: proxyConfiguration,
36 |     });
37 |
38 |     if (!searchKeywords && (!startUrls || !startUrls.length)) {
39 |         throw new Error('You need to provide either searchKeywords or startUrls as input');
40 |     }
41 |
42 |     if (startUrls && startUrls.length) {
43 |         log.info('Starting scraper with startUrls, ignoring searchKeywords');
44 |
45 |         const parseUrls = await Apify.openRequestList(null, startUrls);
46 |         let req;
47 |         // eslint-disable-next-line no-cond-assign
48 |         while (req = await parseUrls.fetchNextRequest()) {
49 |             // need to parse for requestsFromUrl first, then categorize by path
50 |             const label = utils.categorizeUrl(req.url);
51 |             const pUrl = new URL(req.url);
52 |
53 |             if (label === 'CHANNEL' && !pUrl.pathname.includes('/videos')) {
54 |                 pUrl.pathname = `${pUrl.pathname.split('/').filter((s) => s).join('/')}/videos`;
55 |                 req.url = pUrl.toString();
56 |             }
57 |             await requestQueue.addRequest({
58 |                 url: req.url,
59 |                 userData: {
60 |                     label,
61 |                 },
62 |             });
63 |         }
64 |     } else if (searchKeywords) {
65 |         // add starting url
66 |         log.info('Starting scraper with a search keyword');
67 |
68 |         for (let searchKeyword of searchKeywords.split(',')) {
69 |             searchKeyword = `${searchKeyword}`.trim();
70 |
71 |             if (searchKeyword) {
72 |                 await requestQueue.addRequest({
73 |                     url: 'https://www.youtube.com/',
74 |                     uniqueKey: `SEARCH-${searchKeyword}`,
75 |                     userData: {
76 |                         label: 'MASTER',
77 |                         search: searchKeyword,
78 |                     },
79 |                 });
80 |             }
81 |         }
82 |     }
83 |
84 |     const extendOutputFunction = await utils.extendFunction({
85 |         input,
86 |         key: 'extendOutputFunction',
87 |         output: async (data) => {
88 |             await Apify.pushData(data);
89 |         },
90 |         helpers: {},
91 |     });
92 |
93 |     const extendScraperFunction = await utils.extendFunction({
94 |         input,
95 |         key: 'extendScraperFunction',
96 |         output: async () => {}, // no-op for page interaction
97 |         helpers: {
98 |             requestQueue,
99 |             extendOutputFunction,
100 |         },
101 |     });
102 |
103 |     const pptrCrawler = new Apify.PuppeteerCrawler({
104 |         requestQueue,
105 |         browserPoolOptions: {
106 |             maxOpenPagesPerBrowser: 1,
107 |         },
108 |         useSessionPool: true,
109 |         proxyConfiguration: proxyConfig,
110 |         preNavigationHooks: [
111 |             async ({ page }, gotoOptions) => {
112 |                 await puppeteer.blockRequests(page, {
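                    // These patterns are matched as substrings of each request URL;
                    // blocking heavy media and telemetry requests saves bandwidth and
                    // speeds up page loads without affecting the scraped data.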
113 |                     urlPatterns: [
114 |                         '.mp4',
115 |                         '.webp',
116 |                         '.jpeg',
117 |                         '.jpg',
118 |                         '.gif',
119 |                         '.svg',
120 |                         '.ico',
121 |                         '.png',
122 |                         'google-analytics',
123 |                         'doubleclick.net',
124 |                         'googletagmanager',
125 |                         '/videoplayback',
126 |                         '/adview',
127 |                         '/stats/ads',
128 |                         '/stats/watchtime',
129 |                         '/stats/qoe',
130 |                         '/log_event',
131 |                     ],
132 |                 });
133 |
134 |                 gotoOptions.waitUntil = 'networkidle2';
135 |             },
136 |         ],
137 |         handlePageTimeoutSecs,
138 |         handleFailedRequestFunction: async ({ request }) => {
139 |             Apify.utils.log.error(`Request ${request.url} failed too many times`);
140 |
141 |             await Apify.pushData({
142 |                 '#debug': Apify.utils.createRequestDebugInfo(request),
143 |             });
144 |         },
145 |         handlePageFunction: async ({ page, request, session, response }) => {
146 |             // no-output function
147 |             await extendScraperFunction(undefined, {
148 |                 page,
149 |                 request,
150 |             });
151 |
152 |             const hasCaptcha = await page.$('.g-recaptcha');
153 |             if (hasCaptcha) {
154 |                 session.retire();
155 |                 throw 'Got captcha, page will be retried. If this happens often, consider increasing number of proxies';
156 |             }
157 |
158 |             if (utils.isErrorStatusCode(response.status())) {
159 |                 session.retire();
160 |                 throw `Response status is: ${response.status()} msg: ${response.statusText()}`;
161 |             }
162 |
163 |             if (page.url().includes('consent')) {
164 |                 log.info('Clicking consent dialog');
165 |
166 |                 await Promise.all([
167 |                     page.$eval('form[action*="consent"]', (el) => {
168 |                         el.querySelector('button')?.click();
169 |                     }),
170 |                     page.waitForNavigation({ waitUntil: 'networkidle2' }),
171 |                 ]);
172 |
173 |                 session.retire();
174 |             }
175 |
176 |             if (await page.$('.yt-upsell-dialog-renderer')) {
177 |                 // this dialog steals focus, so we need to click it away
178 |                 await page.evaluate(async () => {
179 |                     const noThanks = document.querySelectorAll('.yt-upsell-dialog-renderer [role="button"]');
180 |
181 |                     for (const button of noThanks) {
182 |                         if (button.textContent && button.textContent.includes('No thanks')) {
183 |                             button.click();
184 |                             break;
185 |                         }
186 |                     }
187 |                 });
188 |             }
189 |
190 |             switch (request.userData.label) {
191 |                 case 'CHANNEL':
192 |                 case 'SEARCH':
193 |                 case 'MASTER': {
194 |                     await crawler.handleMaster({ page, requestQueue, searchKeywords, maxResults, request, simplifiedInformation, input });
195 |                     break;
196 |                 }
197 |                 case 'DETAIL': {
198 |                     await crawler.handleDetail(
199 |                         page,
200 |                         request,
201 |                         extendOutputFunction,
202 |                         {
203 |                             doDownload: downloadSubtitles,
204 |                             saveToKVS: saveSubtitlesToKVS,
205 |                             language: subtitlesLanguage,
206 |                             kvs: kvStore,
207 |                             preferAutoGenerated: preferAutoGeneratedSubtitles,
208 |                         },
209 |                         maxComments,
210 |                     );
211 |                     break;
212 |                 }
213 |                 default: throw new Error('Unknown request label in handlePageFunction');
214 |             }
215 |         },
216 |     });
217 |     await pptrCrawler.run();
218 | });
219 |
--------------------------------------------------------------------------------
/src/subtitles.js:
--------------------------------------------------------------------------------
1 | const Apify = require('apify');
2 | const { log } = Apify.utils;
3 | const fetch = require('node-fetch');
4 |
5 | class SrtConvert {
6 |     static TYPE_AUTO_GENERATED = 'auto_generated';
7 |     static TYPE_USER_GENERATED = 'user_generated';
8 |
9 |     constructor(srtJson, lang, type = SrtConvert.TYPE_AUTO_GENERATED) {
10 |         this._json = srtJson;
11 |         this.language = lang;
12 |         this.type = type;
13 |
14 |         this.srt = null;
15 |
16 |         if (this.type !== SrtConvert.TYPE_AUTO_GENERATED && this.type !==
SrtConvert.TYPE_USER_GENERATED) { 17 | throw new Error(`Unknown subtitles type ${this.type}`); 18 | } 19 | } 20 | 21 | convert() { 22 | let subtitles = ''; 23 | let subtCounter = 1; 24 | const events = this._json['events']; 25 | for (let i = 0; i < events.length; i++) { 26 | const e = events[i]; 27 | const segs = e['segs']; 28 | if (segs) { 29 | let line = ''; 30 | segs.forEach(s => { 31 | line += s['utf8'].replace(/\n/g, ' '); 32 | }) 33 | if (line !== '\n') { 34 | const tStart = e['tStartMs']; 35 | subtitles += `${subtCounter}\n`; 36 | subtitles += `${this._msToHMS(tStart)} --> ${this._msToHMS(tStart + e['dDurationMs'])}\n`; 37 | subtitles += `${line}\n\n`; 38 | subtCounter++; 39 | } 40 | } 41 | } 42 | 43 | this.srt = subtitles; 44 | 45 | return subtitles; 46 | } 47 | 48 | _msToHMS(ms) { 49 | let frac = String(ms % 1000); 50 | frac = ('000' + frac).substring(frac.length); 51 | let sec = Math.floor(ms / 1000); 52 | let hrs = Math.floor(sec / 3600); 53 | sec -= hrs * 3600; 54 | let min = Math.floor(sec / 60); 55 | sec -= min * 60; 56 | sec = ('00' + sec).substring(String(sec).length); 57 | 58 | if (hrs > 0) { 59 | min = ('00' + min).substring(String(min).length); 60 | return ('00' + hrs).substring(String(hrs).length) + ":" + min + ":" + sec + ',' + frac; 61 | } else { 62 | return '00:' + ('00' + min).substring(String(min).length) + ":" + sec + ',' + frac; 63 | } 64 | } 65 | 66 | } 67 | 68 | /** 69 | * This function fetches list of available subtitles from video detail page and then, depending on provided settings, 70 | * fetches subtitle JSONs and converts them to .srt format. 71 | * 72 | * @param page Puppeteer page. 73 | * @param language Preferred language. If `null` or `''`, we are instructed to fetch all available subtitles. 74 | * @param preferAutoGenerated If set to true, we prefer automatically generated subtitles before the user provided. 75 | * If set to false and only automatically generated subtitles are available, we fetch at least them. 76 | * @returns {Promise<*[]>} Promise representing the whole fetching and srt generating process. Promise result is 77 | * list of `SrtConvert` instances containing already converted .srt data. See `SrtConvert` class. 78 | */ 79 | async function fetchSubtitles(page, language = null, preferAutoGenerated = false) { 80 | log.debug(`Fetching subtitles for ${page.url()},lang:${language}...`); 81 | 82 | const converters = []; 83 | const script = await page.evaluate(() => { 84 | const scripts = document.body.querySelectorAll('script'); 85 | let target = null; 86 | scripts.forEach(s => { 87 | const html = s.innerHTML; 88 | if (html.startsWith('var ytInitialPlayerResponse')) { 89 | target = html; 90 | } 91 | }); 92 | return target; 93 | }); 94 | 95 | try { 96 | let subtitlesJSON = JSON.parse(`{${String(script).match(/\"captionTracks\".*?(?=])/)}]}`); 97 | const captionTracks = subtitlesJSON['captionTracks']; 98 | let subtitlesToDl = []; 99 | if (!language) { 100 | for (let i = 0; i < captionTracks.length; i++) { 101 | const track = captionTracks[i]; 102 | subtitlesToDl.push({ 103 | lang: track['languageCode'], 104 | url: `${track['baseUrl']}&fmt=json3`, 105 | type: track['kind'] ? 
SrtConvert.TYPE_AUTO_GENERATED : SrtConvert.TYPE_USER_GENERATED,
106 |                 });
107 |             }
108 |         } else {
109 |             const urlCandidates = [];
110 |             for (let i = 0; i < captionTracks.length; i++) {
111 |                 const track = captionTracks[i];
112 |                 if (language === track['languageCode']) {
113 |                     urlCandidates.push(`${track['baseUrl']}&fmt=json3`);
114 |                 }
115 |             }
116 |             for (let i = 0; i < urlCandidates.length; i++) {
117 |                 const urlCandidate = urlCandidates[i];
118 |                 if (preferAutoGenerated) {
119 |                     if (urlCandidate.includes('&kind=asr'))
120 |                         subtitlesToDl.push({lang: language, url: urlCandidate, type: SrtConvert.TYPE_AUTO_GENERATED});
121 |                 } else {
122 |                     if (!urlCandidate.includes('&kind=asr'))
123 |                         subtitlesToDl.push({lang: language, url: urlCandidate, type: SrtConvert.TYPE_USER_GENERATED});
124 |                 }
125 |             }
126 |             if (subtitlesToDl.length === 0 && urlCandidates.length > 0) // fall back to the first track if the preferred kind is missing
127 |                 subtitlesToDl = [{lang: language, url: urlCandidates[0], type: urlCandidates[0].includes('&kind=asr') ? SrtConvert.TYPE_AUTO_GENERATED : SrtConvert.TYPE_USER_GENERATED}];
128 |         }
129 |
130 |         const fetchingUrls = [];
131 |         const fetchingJsons = [];
132 |         for (let i = 0; i < subtitlesToDl.length; i++) {
133 |             const std = subtitlesToDl[i];
134 |             const pFetch = fetch(std.url, {method: 'GET'});
135 |             fetchingUrls.push(pFetch);
136 |             pFetch.then(response => {
137 |                 const pJson = response.json();
138 |                 fetchingJsons.push(pJson);
139 |                 pJson.then(json => {
140 |                     log.debug(
141 |                         `Subtitle type for ${page.url()} lang:${std.lang}, type:${std.type}` +
142 |                         ` fetched, converting to SRT...`
143 |                     );
144 |                     const conv = new SrtConvert(json, std.lang, std.type);
145 |                     conv.convert();
146 |                     converters.push(conv);
147 |                 }).catch(reason => {
148 |                     log.warning(
149 |                         `Unable to convert subtitles for ${page.url()}, ` +
150 |                         `language:${std.lang}\nReason:${reason.toString()}`
151 |                     );
152 |                 });
153 |             }).catch(reason => {
154 |                 log.warning(
155 |                     `Unable to fetch subtitles for ${page.url()}, ` +
156 |                     `language:${std.lang}\nReason:${reason.toString()}`
157 |                 );
158 |             });
159 |         }
160 |         await Promise.all(fetchingUrls);
161 |         await Promise.all(fetchingJsons);
162 |     } catch (e) {
163 |         log.warning(`No subtitles found for ${page.url()}.`);
164 |     }
165 |
166 |     return converters;
167 | }
168 |
169 | async function processFetchedSubtitles(page, videoId, converters, subtitlesSettings) {
170 |     let subtitles = null;
171 |     if (converters) {
172 |         subtitles = [];
173 |         for (let i = 0; i < converters.length; i++) {
174 |             const c = converters[i];
175 |             let srtUrl = null;
176 |             if (subtitlesSettings.saveToKVS) {
177 |                 const id = `subtitles_${videoId}_${c.language}_${c.type}`;
178 |                 log.debug(
179 |                     `Saving subtitles for ${page.url()}, lang:${c.language}, ` +
180 |                     `type:${c.type} to KeyValueStore, id=${id}`
181 |                 );
182 |                 await subtitlesSettings.kvs.setValue(id, {
183 |                     subtitles: c.srt,
184 |                     type: c.type,
185 |                     language: c.language,
186 |                 });
187 |                 srtUrl = subtitlesSettings.kvs.getPublicUrl(id);
188 |             }
189 |             subtitles.push({
190 |                 srt: c.srt,
191 |                 srtUrl: srtUrl,
192 |                 type: c.type,
193 |                 language: c.language,
194 |             });
195 |         }
196 |     }
197 |     return subtitles;
198 | }
199 |
200 | exports.fetchSubtitles = fetchSubtitles;
201 | exports.processFetchedSubtitles = processFetchedSubtitles;
202 |
--------------------------------------------------------------------------------
/test/utility_spec.js:
--------------------------------------------------------------------------------
1 | const { describe, it } = require('mocha');
2 | const { assert, expect } = require('chai');
3 | const should = require('chai').should();
4 | const moment = require('moment');
5 |
6 | const utils =
require('../src/utility'); 7 | 8 | describe('getRandBetween', () => { 9 | const numTestCycles = 1000; 10 | const includeList = [3, 4, 5, 6, 7]; 11 | 12 | it('should return a random number between the given inputs', () => { 13 | // loop to generate random tests 14 | for (let i = 0; i < numTestCycles; i++) { 15 | const result = utils.getRandBetween(3, 7); 16 | assert(includeList.indexOf(result) >= 0, `random value [${result}] is within expected range`); 17 | } 18 | }); 19 | }); 20 | 21 | describe('getRandClickPos', () => { 22 | // start with output of `JSON.stringify(document.createElement('div').getBoundingClientRect())` 23 | // remove right,left,top,bottom to make it compatible with puppeteer boundingBox 24 | const divRect = JSON.parse('{"x":0,"y":0,"width":0,"height":0}'); 25 | divRect.x = 10; 26 | divRect.y = 10; 27 | divRect.width = 50; 28 | divRect.height = 10; 29 | 30 | it('should take a valid puppeteer boundingBox', () => { 31 | divRect.should.have.property('x'); 32 | divRect.should.have.property('y'); 33 | divRect.should.have.property('width'); 34 | divRect.should.have.property('height'); 35 | assert(divRect.width > 0, 'width is greater than zero'); 36 | assert(divRect.height > 0, 'height is greater than zero'); 37 | }); 38 | 39 | it('should select a random point well within the boundaries of a clickable element', () => { 40 | const numTestCycles = 100; 41 | 42 | for (let i = 0; i < numTestCycles; i++) { 43 | const clickPos = utils.getRandClickPos(divRect); 44 | clickPos.should.have.property('xPos'); 45 | clickPos.should.have.property('yPos'); 46 | 47 | const { xPos, yPos } = clickPos; 48 | assert(xPos > divRect.x, 'random x is greater than minimum x'); 49 | assert(xPos < (divRect.x + divRect.width), 'random x is less than maximum x'); 50 | assert(yPos > divRect.y, 'random y is greater than minimum y'); 51 | assert(yPos < (divRect.y + divRect.height), 'random y is less than maximum y'); 52 | } 53 | }); 54 | }); 55 | 56 | describe('categorizeUrl', () => { 57 | it('should categorize different start urls', () => { 58 | expect(utils.categorizeUrl('')).to.equal('MASTER'); 59 | expect(utils.categorizeUrl('/watch?v=394u19u')).to.equal('DETAIL'); 60 | expect(utils.categorizeUrl('https://youtube.com/watch?v=394u19u')).to.equal('DETAIL'); 61 | expect(utils.categorizeUrl('/channel/asdrtsert/videos')).to.equal('CHANNEL'); 62 | expect(utils.categorizeUrl('https://www.youtube.com/user/asdrtsert/videos')).to.equal('CHANNEL'); 63 | expect(utils.categorizeUrl('https://www.youtube.com/c/asdrtsert')).to.equal('CHANNEL'); 64 | expect(utils.categorizeUrl('https://www.youtube.com/results?search_query=hello')).to.equal('SEARCH'); 65 | }); 66 | }); 67 | 68 | describe('getCutoffDate', () => { 69 | it('should return the correct duration for given date string', () => { 70 | const timeNow = moment(); 71 | const numTestCycles = 100; 72 | const durationTypes = ['hours', 'days', 'weeks', 'months', 'years']; 73 | 74 | // loop to generate random tests 75 | let timeThen = null; 76 | let duration = null; 77 | for (let i = 0; i < numTestCycles; i++) { 78 | const selectedIndex = utils.getRandBetween(0, durationTypes.length - 1); 79 | const durType = durationTypes[selectedIndex]; 80 | const count = utils.getRandBetween(1, 9); 81 | 82 | const randInputString = `${count} ${durType} ago`; 83 | 84 | timeThen = utils.getCutoffDate(randInputString); 85 | duration = moment.duration(timeNow.diff(timeThen)); 86 | const newDur = Math.round(duration.as(durType)); 87 | 88 | assert(newDur === count, `getCutoffDate correctly 
extracted '${newDur}' from '${randInputString}'`);
89 |         }
90 |     });
91 | });
92 |
93 | describe('isDateInputValid', () => {
94 |     const isValid = utils.isDateInputValid;
95 |
96 |     it('should validate date input as entered by user', () => {
97 |         assert(isValid('1 week ago') === true, '1 week ago is valid');
98 |         assert(isValid('1 day ago') === true, '1 day ago is valid');
99 |         assert(isValid('1 hour ago') === true, '1 hour ago is valid');
100 |         assert(isValid('3 weeks ago') === true, '3 weeks ago is valid');
101 |         assert(isValid('2 hours ago') === true, '2 hours ago is valid');
102 |         assert(isValid('13 weeks ago') === true, '13 weeks ago is valid');
103 |         assert(isValid('60 weeks ago') === true, '60 weeks ago is valid');
104 |         assert(isValid('36 hours ago') === true, '36 hours ago is valid');
105 |         assert(isValid('120 minutes ago') === true, '120 minutes ago is valid');
106 |         assert(isValid('3 minutes ago') === true, '3 minutes ago is valid');
107 |         assert(isValid('9 days ago') === true, '9 days ago is valid');
108 |
109 |         assert(isValid('0 days ago') === false, '0 days ago is invalid');
110 |         assert(isValid('400 days ago') === true, '400 days ago is valid');
111 |         assert(isValid('1 week agos') === false, '1 week agos is invalid');
112 |         assert(isValid('n days ago') === false, 'n days ago is invalid');
113 |         assert(isValid('3 decades ago') === false, '3 decades ago is invalid');
114 |         assert(isValid('minutes ago') === false, 'minutes ago is invalid');
115 |         assert(isValid('hours') === false, 'hours is invalid');
116 |         assert(isValid('ago') === false, 'ago is invalid');
117 |         assert(isValid('60') === false, '60 is invalid');
118 |         assert(isValid('36 # ago') === false, '36 # ago is invalid');
119 |         assert(isValid('120 minutes ago ##') === false, '120 minutes ago ## is invalid');
120 |     });
121 | });
122 |
123 | describe('getYoutubeDateFilters', () => {
124 |     const filter = utils.getYoutubeDateFilters;
125 |
126 |     it('should return the youtube filter corresponding to the user\'s requested date filter', () => {
127 |         expect(filter('1 week ago')).to.be.an('array').that.has.members(['Upload date', 'This week']);
128 |         expect(filter('1 day ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
129 |         expect(filter('1 hour ago')).to.be.an('array').that.has.members(['Upload date', 'Last hour']);
130 |         expect(filter('3 weeks ago')).to.be.an('array').that.has.members(['Upload date', 'This month']);
131 |         expect(filter('2 hours ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
132 |         expect(filter('13 weeks ago')).to.be.an('array').that.has.members(['Upload date', 'This year']);
133 |         // eslint-disable-next-line no-unused-expressions
134 |         expect(filter('60 weeks ago')).to.be.an('array').that.is.empty;
135 |         expect(filter('36 hours ago')).to.be.an('array').that.has.members(['Upload date', 'This week']);
136 |         expect(filter('120 minutes ago')).to.be.an('array').that.has.members(['Upload date', 'Today']);
137 |         expect(filter('3 minutes ago')).to.be.an('array').that.has.members(['Upload date', 'Last hour']);
138 |         expect(filter('9 days ago')).to.be.an('array').that.has.members(['Upload date', 'This month']);
139 |         // eslint-disable-next-line no-unused-expressions
140 |         expect(filter('400 days ago')).to.be.an('array').that.is.empty;
141 |     });
142 | });
143 |
144 | describe('getVideoId', () => {
145 |     const testId = 'jL_nMu9HhfA';
146 |
147 |     const testUrlList = [];
148 |     testUrlList.push(`http://www.youtube.com/sandalsResorts#p/c/54B8C800269D7C1B/0/${testId}`);
149 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
150 |     testUrlList.push(`http://youtu.be/${testId}`);
151 |     testUrlList.push(`http://www.youtube.com/embed/${testId}`);
152 |     testUrlList.push(`https://www.youtube.com/embed/${testId}`);
153 |     testUrlList.push(`http://www.youtube.com/v/${testId}?fs=1&hl=en_US`);
154 |     testUrlList.push(`http://www.youtube.com/watch?v=${testId}`);
155 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
156 |     testUrlList.push(`http://www.youtube.com/ytscreeningroom?v=${testId}`);
157 |     testUrlList.push(`http://www.youtube.com/user/Scobleizer#p/u/1/1${testId}`);
158 |     testUrlList.push(`http://www.youtube.com/watch?v=${testId}&feature=featured`);
159 |
160 |     it('should extract the video id from various youtube url formats', () => {
161 |         for (const testUrl of testUrlList) {
162 |             const videoId = utils.getVideoId(testUrl);
163 |             assert(videoId.indexOf(testId) >= 0, `${testId} extracted from ${testUrl}`);
164 |         }
165 |     });
166 | });
167 |
168 | describe('getMaxVideos', () => {
169 |     it('should return the correct number of max videos to use', () => {
170 |         assert(utils.getMaxVideos(20, 30) === 20, 'numOfVideos is 20 and userMaximum is 30');
171 |         assert(utils.getMaxVideos(30, 20) === 20, 'numOfVideos is 30 and userMaximum is 20');
172 |         assert(utils.getMaxVideos(0, 30) === 0, 'numOfVideos is 0 and userMaximum is 30');
173 |         assert(utils.getMaxVideos(20, 0) === 20, 'numOfVideos is 20 and userMaximum is 0');
174 |         assert(utils.getMaxVideos(20, 20) === 20, 'numOfVideos is 20 and userMaximum is 20');
175 |     });
176 | });
177 |
178 | describe('unformatNumbers', () => {
179 |     it('should transform formatted numbers like 1.2K into 1200', () => {
180 |         assert(utils.unformatNumbers('1.23M') === 1230000, '1.23M is converted to 1230000');
181 |         assert(utils.unformatNumbers('6.0K') === 6000, '6.0K is converted to 6000');
182 |         assert(utils.unformatNumbers('2B') === 2000000000, '2B is converted to 2000000000');
183 |         assert(utils.unformatNumbers('0K') === 0, '0K is converted to 0');
184 |         assert(utils.unformatNumbers('1K') === 1000, '1K is converted to 1000');
185 |         assert(utils.unformatNumbers('0.24K') === 240, '0.24K is converted to 240');
186 |     });
187 | });
188 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Using the **YouTube scraper**, you can extract data from keyword search results, scrape detailed video data such as the like/dislike ratio, collect channel details, download captions, and scrape comment sections.
2 |
3 | Unlike the [official YouTube API](https://developers.google.com/youtube/v3), this YouTube scraper lets you scrape results without quota limits or a login requirement.
4 |
5 | Our YouTube API is open-source and you can easily run it locally or on your own system. Contributions are welcome.
6 |
7 | ## Features
8 |
9 | - Scrape videos by specifying multiple search keywords or URLs to get [video details](https://apify.com/bernardo/youtube-scraper#scraper-output), including e.g. the like/dislike ratio (see the sketch after this list).
10 | - Scrape channel details (username, description, number of subscribers, etc.)
11 | - **[NEW]** Scrape and download YouTube subtitles and captions (both auto- and user-generated) in any language from any country.
12 | - **[NEW]** Scrape the YouTube comment section (no nested comments at the moment though).
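
For example, the like/dislike ratio mentioned above can be derived directly from the scraped fields (a minimal sketch; the `likes` and `dislikes` field names follow the example output shown later in this README):

```js
// Hypothetical helper: like/dislike ratio of one scraped dataset item.
const likeDislikeRatio = ({ likes, dislikes }) => (
    likes + dislikes > 0 ? likes / (likes + dislikes) : null
);
```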
13 |
14 | ## Tutorial
15 |
16 | For a more detailed explanation of [how to scrape YouTube](https://blog.apify.com/how-to-scrape-youtube), read the step-by-step tutorial on our [blog](https://blog.apify.com/).
17 |
18 | And for more ideas on how to use the extracted data, check out our [industries pages](https://apify.com/industries) for concrete ways web scraping results are already being used across projects and businesses of various scales - in [media and marketing](https://apify.com/industries/marketing-and-media), for instance.
19 |
20 | ## Cost of usage
21 |
22 | On average, scraping **1000 items** from YouTube via the Apify platform will cost you around **2.5 USD** in platform credits off your subscription plan. For more details about the plans we offer, platform credits and usage, see the [platform pricing page](https://apify.com/pricing/actors).
23 |
24 | If you're not sure how many credits you've got left on your plan and whether you might need to upgrade, you can always check your limits in the *Settings* -> *Usage and Billing* tab in [your Console](https://console.apify.com/).
25 | The easiest way to find out how many credits your actor will need is to perform a test run.
26 |
27 | ### Proxy usage
28 | This actor, like most [social media-related scrapers](https://apify.com/store?category=SOCIAL_MEDIA), requires **proxy servers** to run properly. You can use either your own proxy servers or [Apify Proxy](https://www.apify.com/docs/proxy). We recommend using [datacenter proxies](https://help.apify.com/en/articles/5265932-what-is-a-proxy) to achieve the best scraping potential of this actor.
29 |
30 | ## Input parameters
31 |
32 | If this actor is run on our [platform](https://console.apify.com/), a user-friendly UI will help you configure all the necessary and optional parameters of this scraper before running it. Our YouTube actor recognizes the following input fields:
33 |
34 | - **searchKeywords** - Your YouTube search query, say *Nimbus 2000 reviews*; this can be used instead of a URL.
35 | - **startUrls** - A more precise alternative to **searchKeywords**. You can provide specific YouTube search, channel or video URLs.
36 | - **maxResults** - Sets how many videos should be scraped from each search or channel. Defaults to 50, but you can leave it empty for unlimited search.
37 | - **maxComments** - Limits the number of comments that you want to scrape. 0 or empty means no comments will be scraped.
38 |
39 | - **downloadSubtitles** - Scrape both user-generated and auto-generated captions and convert them to SRT format. Boolean value, defaults to false.
40 | - **subtitlesLanguage** - Download only subtitles of the selected language (possible values `"en"`, `"de"`, `"es"`...)
41 | - **preferAutoGeneratedSubtitles** - Prefer the auto-generated speech-to-text subtitles to the user-made ones.
42 | - **saveSubsToKVS** - Saves the scraped subtitles in the *Apify Key Value Store* (see the sketch after this list).
43 | - **proxyConfiguration** *(required)* - Configures proxy settings.
44 | - **verboseLog** - Turns on verbose logging for accurate monitoring and more details about the runs.
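
To illustrate **saveSubsToKVS**: after a run, the stored subtitles can be read back from the default Key Value Store. Here is a minimal sketch (the key pattern and record shape follow `src/subtitles.js`; the video ID and language are placeholders):

```js
const Apify = require('apify');

Apify.main(async () => {
    const store = await Apify.openKeyValueStore();
    // Keys are saved as subtitles_<videoId>_<languageCode>_<type>,
    // where <type> is 'auto_generated' or 'user_generated'.
    const record = await store.getValue('subtitles_oxy8udgWRmo_en_user_generated');
    if (record) {
        console.log(record.language, record.type);
        console.log(record.subtitles); // the converted .srt text
    }
});
```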
45 |
46 | *See more technical details of the input parameters in the [Input Schema tab](https://apify.com/bernardo/youtube-scraper/input-schema#searchKeywords) of this actor.*
47 |
48 | ### Example
49 |
50 | ```json
51 | {
52 |     "searchKeywords": "Terminator dark fate",
53 |     "maxResults": 30,
54 |     "startUrls": [{
55 |         "url": "https://www.youtube.com/channel/UC8w/videos" // channel videos
56 |     }, {
57 |         "url": "https://www.youtube.com/results?search_query=finances" // search queries
58 |     }, {
59 |         "url": "https://www.youtube.com/watch?v=kJQP7kiw5Fk" // videos
60 |     }],
61 |     "proxyConfiguration": {
62 |         "useApifyProxy": true
63 |     },
64 |     "verboseLog": false
65 | }
66 |
67 | ```
68 |
69 |
70 | ## YouTube Scraper output
71 |
72 | After the actor finishes the run, it will store the scraped results in the *Dataset*. Each YouTube video becomes a separate record in the dataset (see a JSON example below). Using the Apify platform, you can choose to present and download the contents of the dataset in different data formats (JSON, RSS, XML, HTML Table...).
73 |
74 | ### Example
75 | ```json
76 | {
77 |     "title": "Terminator: Dark Fate - Official Trailer (2019) - Paramount Pictures",
78 |     "id": "oxy8udgWRmo",
79 |     "url": "https://www.youtube.com/watch?v=oxy8udgWRmo",
80 |     "viewCount": 15432,
81 |     "date": "2019-08-29T00:00:00+00:00",
82 |     "likes": 121000,
83 |     "dislikes": 23000,
84 |     "channelName": "Paramount Pictures",
85 |     "channelUrl": "https://www.youtube.com/channel/UCF9imwPMSGz4Vq1NiTWCC7g",
86 |     "numberOfSubscribers": 1660000,
87 |     "details": "Welcome to the day after..."
88 | }
89 | ```
90 |
91 | ### Extend output function
92 |
93 | The extend output function lets you add or remove properties on the output items, or omit an item from the dataset entirely by returning `null`:
94 |
95 | ```js
96 | async ({ item }) => {
123 |     // remove information from the item
124 |     item.details = undefined;
125 |     // or delete item.details;
126 |     return item;
127 | }
128 | ```
129 |
130 | ```js
131 | async ({ item, page }) => {
132 |     // add more info, in this case, the shortLink for the video
133 |     const shortLink = await page.evaluate(() => {
134 |         const link = document.querySelector('link[rel="shortlinkUrl"]');
135 |         if (link) {
136 |             return link.href;
137 |         }
138 |     });
139 |
140 |     return {
141 |         ...item,
142 |         shortLink,
143 |     }
144 | }
145 | ```
146 |
147 | ```js
148 | async ({ item }) => {
149 |     // omit item, just return null
150 |     return null;
151 | }
152 | ```
153 |
154 | ### Extend scraper function
155 |
156 | The extend scraper function allows you to add functionality to the existing baseline behavior. For example, you may enqueue related videos, but not recursively:
157 |
158 | ```js
159 | async ({ page, request, requestQueue, customData, Apify }) => {
160 |     if (request.userData.label === 'DETAIL' && !request.userData.isRelated) {
161 |         await page.waitForSelector('ytd-watch-next-secondary-results-renderer');
162 |
163 |         const related = await page.evaluate(() => {
164 |             return [...document.querySelectorAll('ytd-watch-next-secondary-results-renderer a[href*="watch?v="]')].map(a => a.href);
165 |         });
166 |
167 |         for (const url of related) {
168 |             await requestQueue.addRequest({
169 |                 url,
170 |                 userData: {
171 |                     label: 'DETAIL',
172 |                     isRelated: true,
173 |                 },
174 |             });
175 |         }
176 |     }
177 | }
178 | ```
179 |
180 | *NB: If this specific function throws an exception, the same URL will be retried.*
181 |
182 | ## Acknowledgments and personal data
183 |
184 | This scraper handles cookies and privacy consent dialogs on your behalf. Therefore, you should be aware that the results from your YouTube scraping might contain personal data.
185 | 186 | Personal data is protected by GDPR ([EU Regulation 2016/679](https://eur-lex.europa.eu/eli/reg/2016/679/oj)), and by other regulations around the world. You should **not** scrape personal data unless you have a legitimate reason to do so. 187 | 188 | If you're unsure whether your reason is legitimate, consult your lawyers. You can also read our blog post on the [legality of web scraping](https://blog.apify.com/is-web-scraping-legal/). 189 | 190 | ## Other video and social media scrapers 191 | 192 | We have other video-related scrapers in stock for you; to see more of those, check out the [Video Category in Apify Store](https://apify.com/store?category=VIDEOS) or the compilation of [Social Media Scrapers](https://apify.com/store?category=SOCIAL_MEDIA). 193 | 194 | ## Your feedback 195 | 196 | We’re always working on improving the performance of our actors. So if you’ve got any technical feedback about the work of our YouTube API, or simply **found a bug,** please create an issue on the [Github page](https://github.com/bernardro/actor-youtube-scraper) and we’ll get to it. 197 | 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Apify Technologies s.r.o. 190 | 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/src/crawler_utils.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable max-len */
2 | const moment = require('moment');
3 | const Apify = require('apify');
4 | // eslint-disable-next-line no-unused-vars
5 | const Puppeteer = require('puppeteer');
6 | 
7 | const { log, sleep } = Apify.utils;
8 | 
9 | const utils = require('./utility');
10 | const CONSTS = require('./consts');
11 | const { handleErrorAndScreenshot, unformatNumbers } = require('./utility');
12 | const { fetchSubtitles, processFetchedSubtitles } = require('./subtitles');
13 | 
14 | /**
15 |  * @param {{
16 |  *  page: Puppeteer.Page,
17 |  *  requestQueue: Apify.RequestQueue,
18 |  *  searchKeywords: string[],
19 |  *  maxResults: number,
20 |  *  request: Apify.Request,
21 |  *  simplifiedInformation: boolean,
22 |  *  input: object,
23 |  * }} config
24 |  */
25 | exports.handleMaster = async ({ page, requestQueue, searchKeywords, maxResults, request, simplifiedInformation, input }) => {
26 |     const { searchBox, toggleFilterMenu, filterBtnsXp } = CONSTS.SELECTORS.SEARCH;
27 |     const { search, label } = request.userData;
28 | 
29 |     // Search only if a search term was provided directly on input; for other Start URLs we go straight to scrolling
30 |     if (search && label === 'MASTER') {
31 |         // we are searching
32 |         log.debug('waiting for input box...');
33 |         const searchBxElem = await page.waitForSelector(searchBox, { visible: true });
34 |         if (searchBxElem) {
35 |             log.debug(`[${search}]: searchBoxInput found at ${searchBox}`);
36 |         }
37 | 
38 |         log.info(`[${search}]: Entering search text...`);
39 |         await utils.doTextInput(page, search);
40 | 
41 |         // submit search and wait for results page (and filter button) to load
42 |         log.info(`[${search}]: Submitting search...`);
43 | 
44 |         await Promise.allSettled([
45 |             page.tap('#search-icon-legacy'),
46 |             page.waitForNavigation({ timeout: 15000 }),
47 |         ]);
48 | 
49 |         // pause while page reloads
50 |         await sleep(utils.getDelayMs(CONSTS.DELAY.HUMAN_PAUSE));
51 |     }
52 | 
53 |     const searchOrUrl = search || request.url;
54 | 
55 |     log.debug(`[${searchOrUrl}]: waiting for first video to load...`);
56 |     const { youtubeVideosSection, youtubeVideosRenderer } = CONSTS.SELECTORS.SEARCH;
57 |     // static wait to ensure the page is loaded; networkidle2 is sometimes unreliable here
58 |     await page.waitForTimeout(CONSTS.DELAY.START_LOADING_MORE_VIDEOS);
59 |     const queuedVideos = await page.$$(`${youtubeVideosSection} ${youtubeVideosRenderer}`);
60 | 
61 |     // prepare to infinite scroll manually
62 |     // puppeteer.infiniteScroll(page) is currently buggy
63 |     // see https://github.com/apifytech/apify-js/issues/503
64 |     await utils.moveMouseToCenterScreen(page, CONSTS.MOUSE_STEPS);
65 | 
66 |     // keep scrolling until there are no more videos or the max limit is reached
67 |     if (queuedVideos.length === 0) {
68 |         if (searchKeywords) {
69 |             throw `[${searchOrUrl}]: Error: The keywords '${searchKeywords}' returned no YouTube videos, retrying...`;
70 |         }
71 |         throw `[${searchOrUrl}]: Error: No videos found`;
72 |     }
73 | 
74 |     log.info(`[${searchOrUrl}]: Starting infinite scrolling downwards to load all the videos...`);
75 | 
76 |     const maxRequested = (maxResults && maxResults > 0) ? +maxResults : 99999;
77 | 
78 |     const basicInfoParams = {
79 |         page,
80 |         maxRequested,
81 |         isSearchResultPage: ['SEARCH'].includes(label),
82 |         input,
83 |         requestUrl: request.url,
84 |     };
85 | 
86 |     const loadVideosUrlsParams = {
87 |         requestQueue,
88 |         page,
89 |         maxRequested,
90 |         isSearchResultPage: ['MASTER', 'SEARCH'].includes(label),
91 |         searchOrUrl,
92 |     };
93 | 
94 |     if (!simplifiedInformation) {
95 |         await utils.loadVideosUrls(loadVideosUrlsParams);
96 |     } else {
97 |         await getBasicInformation(basicInfoParams);
98 |     }
99 | };
100 | 
101 | exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSettings, maxComments) => {
102 |     const { titleXp, viewCountXp, uploadDateXp, likesXp, dislikesXp,
103 |         channelXp, subscribersXp, descriptionXp, durationSlctr, commentsSlctr } = CONSTS.SELECTORS.VIDEO;
104 | 
105 |     log.info(`handling detail url ${request.url}`);
106 |     // Need to scroll twice to load comments. One scroll works locally, but as of 17.05.2022 two scrolls are needed on the platform.
107 |     await page.evaluate(() => {
108 |         window.scrollBy(window.innerWidth, window.innerHeight);
109 |     });
110 | 
111 |     await sleep(CONSTS.DELAY.START_LOADING_MORE_VIDEOS);
112 | 
113 |     await page.evaluate(() => {
114 |         window.scrollBy(window.innerWidth, window.innerHeight);
115 |     });
116 | 
117 |     const videoId = utils.getVideoId(request.url);
118 |     log.debug(`got videoId as ${videoId}`);
119 | 
120 |     // TODO: These getDataFromXpath calls are fragile design, as any missing selector will crash the whole page handler.
121 |     // They should instead use jQuery or be wrapped in try/catch.
122 |     log.debug(`searching for title at ${titleXp}`);
123 |     const title = await utils.getDataFromXpath(page, titleXp, 'innerHTML')
124 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-title-failed'));
125 |     log.debug(`got title as ${title}`);
126 | 
127 |     log.debug(`searching for viewCount at ${viewCountXp}`);
128 |     const viewCountStr = await utils.getDataFromXpath(page, viewCountXp, 'innerHTML')
129 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-viewCount-failed'));
130 |     const viewCount = utils.unformatNumbers(viewCountStr);
131 |     log.debug(`got viewCount as ${viewCountStr} -> ${viewCount}`);
132 | 
133 |     log.debug(`searching for uploadDate at ${uploadDateXp}`);
134 |     const uploadDateStr = await utils.getDataFromXpath(page, uploadDateXp, 'innerHTML')
135 |         .catch((e) => handleErrorAndScreenshot(page, e, 'Getting-uploadDate-failed'));
136 |     const uploadDateCleaned = uploadDateStr.replace('Premiered', '').trim();
137 |     const uploadDate = moment(uploadDateCleaned, 'MMM DD, YYYY').format();
138 |     log.debug(`got uploadDate as ${uploadDate}, uploadDateStr: ${uploadDateStr}, uploadDateCleaned: ${uploadDateCleaned}`);
139 | 
140 | 
141 |     // YT returns 3 different types of "like" button. Couldn't find any generic selector. Getting info from