├── scripts ├── lint ├── test ├── test-watch ├── thumbnails │ ├── Dockerfile │ └── run ├── index.js └── new-config.js ├── .gitignore ├── src ├── pulse.js ├── test-helper.js ├── algolia.js ├── disk-logger.js ├── globals.js ├── algolia.settings.js ├── __tests__ │ ├── fileutils.js │ ├── utils.js │ ├── transformer.js │ ├── language.js │ └── youtube.js ├── fileutils.js ├── utils.js ├── progress.js ├── transformer.js ├── language.js └── youtube.js ├── jest.config.js ├── configs ├── dataxday.js ├── kiwiparty.js ├── config.sample.js ├── saastr.js ├── paris-container-day.js ├── googleio.js ├── __tests__ │ ├── __snapshots__ │ │ ├── usi.js.snap │ │ └── algolia-education.js.snap │ ├── usi.js │ ├── algolia-education.js │ ├── takeoffconference.js │ ├── hackference.js │ ├── dotconferences.js │ ├── algolia-meetups.js │ ├── chatbot_summit.js │ ├── voice_summit.js │ ├── criticalrole.js │ ├── laracon.js │ ├── writethedocs.js │ └── config-helper.js ├── odessajs.js ├── takeoffconference.js ├── hackference.js ├── usi.js ├── algolia-meetups.js ├── algolia-education.js ├── chatbot_summit.js ├── voice_summit.js ├── laracon.js ├── writethedocs.js ├── criticalrole.js ├── dotconferences.js └── config-helper.js ├── .eslintrc.js ├── .babelrc ├── README.md └── package.json /scripts/lint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | 4 | eslint ./src/*.js ./configs/*.js 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .env 3 | .envrc 4 | google.service-account-file.json 5 | cache 6 | logs 7 | -------------------------------------------------------------------------------- /scripts/test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | jest \ 3 | --config ./jest.config.js \ 4 | --no-cache \ 5 | ./src/ ./configs 6 | 
-------------------------------------------------------------------------------- /src/pulse.js: -------------------------------------------------------------------------------- 1 | import EventEmitter from 'events'; 2 | const pulse = new EventEmitter(); 3 | 4 | export default pulse; 5 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-commonjs */ 2 | module.exports = { 3 | bail: true, 4 | resetMocks: true, 5 | restoreMocks: true, 6 | }; 7 | -------------------------------------------------------------------------------- /configs/dataxday.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'dataxday', 3 | playlists: [ 4 | 'PL-Wbj9VN8zDRMzeWZUv0AUEfs-r_t4HyN', // DataXDay'18 5 | ], 6 | }; 7 | -------------------------------------------------------------------------------- /configs/kiwiparty.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'kiwiparty', 3 | playlists: [ 4 | 'PL-U84vmvcJwUdcIOpIDXgNoVhNWLLpxRc', // KiwiParty 2018 5 | ], 6 | }; 7 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: ['algolia', 'algolia/jest'], 3 | rules: { 4 | 'no-console': 0, 5 | 'no-unused-vars': ['error', { argsIgnorePattern: '^_' }], 6 | }, 7 | }; 8 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [["@babel/preset-env", { "targets": { "node": 6 } }]], 3 | "plugins": [ 4 | "dynamic-import-node-sync", 5 | "@babel/plugin-proposal-object-rest-spread" 6 | ] 7 | } 8 | 
-------------------------------------------------------------------------------- /configs/config.sample.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: '{{indexName}}', 3 | playlists: ['{{playlistIds}}'], 4 | // transformData(rawRecord, helper) { 5 | // let record = rawRecord; 6 | 7 | // return record; 8 | // } 9 | }; 10 | -------------------------------------------------------------------------------- /configs/saastr.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'saastr', 3 | playlists: [ 4 | 'PLGlmLTbngJa87gZrq0LohHNQnG_a5t760', // 2018 5 | 'PLGlmLTbngJa9fbcOjinh4FZHVYsizzhdX', // 2017 6 | 'PLGlmLTbngJa-TjQk_B-qAhrjjNu29ydff', // 2016 7 | ], 8 | }; 9 | -------------------------------------------------------------------------------- /scripts/test-watch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # --no-watchman is needed otherwise Jest got confused as to which file changed 3 | # and run tests for the wrong files 4 | jest \ 5 | --config ./jest.config.js \ 6 | --watch \ 7 | --no-watchman \ 8 | ./src ./configs 9 | -------------------------------------------------------------------------------- /configs/paris-container-day.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'paris-container-day', 3 | playlists: [ 4 | 'PLTQhofNmqyEf6IUbcCtaUPkHk1nJXnQzL', // Paris Container Day 2017 5 | 'PLTQhofNmqyEe_hSmvYaP_EKpR7m0Bk2Je', // Paris Container Day 2018 6 | ], 7 | }; 8 | -------------------------------------------------------------------------------- /configs/googleio.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'googleio', 3 | playlists: [ 4 | 'PLOU2XLYxmsIInFRc3M44HUTQc3b_YJ4-Y', // 2018 5 | 
'PLOU2XLYxmsIKC8eODk_RNCWv3fBcLvMMy', // 2017 6 | 'PLOU2XLYxmsILe6_eGvDN3GyiodoV3qNSC', // 2016 7 | 'PLOU2XLYxmsIKLNUPiFCWVtcO7mZRZ9MmS', // 2015 8 | ], 9 | }; 10 | -------------------------------------------------------------------------------- /configs/__tests__/__snapshots__/usi.js.snap: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`USI transformData Pour faire naître une idée - Cédric Villani, à l'USI 1`] = ` 4 | Object { 5 | "speakers": Array [ 6 | Object { 7 | "name": "Cédric Villani", 8 | }, 9 | ], 10 | "video": Object { 11 | "title": "Pour faire naître une idée", 12 | }, 13 | } 14 | `; 15 | -------------------------------------------------------------------------------- /configs/__tests__/__snapshots__/algolia-education.js.snap: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Algolia Education transformData Algolia Build 101 - Push Data - for Javascript developers 1`] = ` 4 | Object { 5 | "language": "Javascript", 6 | "speakers": Array [], 7 | "video": Object { 8 | "title": "Push Data - for Javascript developers", 9 | }, 10 | } 11 | `; 12 | -------------------------------------------------------------------------------- /configs/odessajs.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'odessajs', 3 | playlists: [ 4 | 'PLUF1zRLAgrPF9ZvT-MJFNdkX4uUi0oPnC', // OdessaJS 2014 5 | 'PLUF1zRLAgrPF5-5Puh3kRpF7Tf1LMGCDv', // OdessaJS 2015 6 | 'PLUF1zRLAgrPET1qRvSeKCraJxsjHZUSjw', // OdessaJS 2016 7 | 'PLUF1zRLAgrPHwKYzXbAprzO3-Ykbq1xkY', // OdessaJS 2017 - talks 8 | 'PLUF1zRLAgrPGnLTqOXYU1Sqy4NNVwAWfd', // OdessaJS'2018 - talks 9 | ], 10 | }; 11 | -------------------------------------------------------------------------------- /src/test-helper.js: 
const module = {
  /**
   * Builds a mocking shortcut bound to one module.
   * @param {Object} moduleToMock The module whose methods will be spied on
   * @returns {Function} Function taking a methodName and an (optional) value;
   * it installs a jest spy on that method, forces it to return the value, and
   * returns the configured spy
   **/
  mock(moduleToMock) {
    return (methodName, value) => {
      const spy = jest.spyOn(moduleToMock, methodName);
      return spy.mockReturnValue(value);
    };
  },
};

export default module;
-------------------------------------------------------------------------------- 1 | import config from '../usi.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('USI', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 10 | 11 | it("Pour faire naître une idée - Cédric Villani, à l'USI", () => { 12 | const input = { 13 | video: { 14 | title: "Pour faire naître une idée - Cédric Villani, à l'USI", 15 | }, 16 | }; 17 | 18 | const actual = current(input); 19 | 20 | expect(actual).toMatchSnapshot(); 21 | }); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /configs/__tests__/algolia-education.js: -------------------------------------------------------------------------------- 1 | import config from '../algolia-education.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('Algolia Education', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 10 | 11 | it('Algolia Build 101 - Push Data - for Javascript developers', () => { 12 | const input = { 13 | video: { 14 | title: 'Algolia Build 101 - Push Data - for Javascript developers', 15 | }, 16 | }; 17 | 18 | const actual = current(input); 19 | 20 | expect(actual).toMatchSnapshot(); 21 | }); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /configs/hackference.js: -------------------------------------------------------------------------------- 1 | import _ from 'lodash'; 2 | export default { 3 | indexName: 'hackference', 4 | playlists: [ 5 | 'PLJK9M6xgJ-uYeAO4rGRB_yDRFTXwVNWQY', // 2017 6 | ], 7 | transformData(rawRecord, helper) { 8 | let record = rawRecord; 9 | 10 | // Finding conference and year from playlist name 11 | record = helper.enrich( 12 | record, 13 | 'playlist.title', 
export default {
  indexName: 'usi',
  playlists: [
    'PLyzb9DL11tdZbjRpEDyP4s1pQsxeFg6x2', // 2017
    'PLyzb9DL11tdYqsgu0kQICQKpt0lMz0Nl5', // 2016
    'PLyzb9DL11tdbBE9jpIm76GPcANwSG7Otf', // 2015
  ],
  /**
   * Cleans the video title and extracts the speakers from it
   * @param {Object} rawRecord Record as built from the YouTube data
   * @param {Object} helper Config helper (provides trimKey and enrich)
   * @returns {Object} The updated record
   **/
  transformData(rawRecord, helper) {
    let record = rawRecord;

    // Remove mentions of USI in the title.
    // The returned record is kept (as the other configs do) so the cleanup
    // does not silently depend on trimKey mutating its input.
    record = helper.trimKey(
      record,
      'video.title',
      ", à l'USI",
      ', at USI',
      'USI 2015 - ',
      'USI 2016 - ',
      'USI 2016 : ',
      'USI 2017'
    );

    // Parse title and speaker name
    record = helper.enrich(
      record,
      'video.title',
      '{video.title} - {_speakers_}'
    );

    return record;
  },
};
export default {
  /**
   * Push the records to the Algolia index of the current config
   *
   * @param {Array} records Records to push
   * @returns {*} Whatever indexing.fullAtomic returns (presumably a promise;
   * confirm against algolia-indexing). It is returned so callers that
   * `await` this method actually wait for, and can catch errors from, the
   * indexing.
   **/
  run(records) {
    const credentials = {
      apiKey: globals.algoliaApiKey(),
      appId: globals.algoliaAppId(),
      indexName: globals.configName(),
    };

    // Hand a private copy to the config: transformSettings implementations
    // may modify the object they receive, and the defaults are shared.
    let settings = _.cloneDeep(defaultIndexSettings);
    const transformSettings = _.get(globals.config(), 'transformSettings');
    if (transformSettings) {
      settings = transformSettings(settings);
    }

    console.info(chalk.blue('Pushing to Algolia'));
    indexing.verbose();
    indexing.config({ batchMaxSize: 100 });
    return indexing.fullAtomic(credentials, records, settings);
  },
};
nameHashes[playlistName]); 20 | 21 | // Get year from published date 22 | const publishedDate = _.get(record, 'video.publishedDate.timestamp'); 23 | _.set(record, 'conference.year', helper.year(publishedDate)); 24 | 25 | return record; 26 | }, 27 | }; 28 | -------------------------------------------------------------------------------- /src/disk-logger.js: -------------------------------------------------------------------------------- 1 | const WRITE_RESPONSE_LOGS = process.env.WRITE_RESPONSE_LOGS; 2 | import fileutils from './fileutils'; 3 | import _ from 'lodash'; 4 | 5 | const module = { 6 | /** 7 | * Log the API return data to disk 8 | * 9 | * @param {String} destination File path to save the file (in the ./logs 10 | * directory) 11 | * @param {Object|String} content Content to store on disk 12 | * @returns {Promise} Write on disk promise 13 | * 14 | * Note that if the content is an object, it will be saved as pretty printed 15 | * JSON, otherwise it will be saved as raw text. 16 | **/ 17 | async write(destination, content) { 18 | if (!WRITE_RESPONSE_LOGS) { 19 | return false; 20 | } 21 | 22 | const writeMethod = _.isObject(content) 23 | ? 
/**
 * Parsing command line arguments
 **/
const argv = yargs
  .usage('Usage: yarn index [config]')
  .command('$0 config', 'Index the videos of the specified config')
  .help(false)
  .version(false).argv;

/**
 * Main entry point: fetch the videos of the config, transform them into
 * records and push them to Algolia.
 * Any failure is reported on stderr and makes the process exit with a
 * non-zero status, so shells and CI can detect a failed indexing.
 **/
(async () => {
  try {
    globals.init(argv.config);

    // Get all video data from YouTube
    const videos = await youtube.getVideos();
    progress.displayWarnings();

    // Transform videos in records
    const records = await transformer.run(videos);
    progress.displayWarnings();

    // Push records
    await algolia.run(records);
  } catch (err) {
    console.error(err);
    // exitCode (rather than process.exit) lets pending output flush first
    process.exitCode = 1;
  }
})();
14 | readFromCache() { 15 | return READ_FROM_CACHE; 16 | }, 17 | writeResponseLogs() { 18 | return WRITE_RESPONSE_LOGS; 19 | }, 20 | config() { 21 | return CONFIG; 22 | }, 23 | configName() { 24 | return CONFIG_NAME; 25 | }, 26 | youtubeApiKey() { 27 | return YOUTUBE_API_KEY; 28 | }, 29 | algoliaAppId() { 30 | return ALGOLIA_APP_ID; 31 | }, 32 | algoliaApiKey() { 33 | return ALGOLIA_API_KEY; 34 | }, 35 | }; 36 | 37 | export default globals; 38 | -------------------------------------------------------------------------------- /scripts/thumbnails/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # Needed environment variables: 4 | # VIDEO_ID: The YouTube video id to target 5 | # AWS_ACCESS_KEY_ID: AWS Access Key 6 | # AWS_SECRET_ACCESS_KEY: AWS Secret Access Key 7 | 8 | # TODO: 9 | # Stop if thumbnails already in S3 10 | 11 | # Work in /tmp/{VIDEO_ID} 12 | cd /tmp 13 | mkdir -p "talksearch/$VIDEO_ID" 14 | cd "./talksearch/$VIDEO_ID" 15 | 16 | # Download the video 17 | youtube-dl \ 18 | --output "video.mp4" \ 19 | --format 133 \ 20 | --continue \ 21 | "$VIDEO_ID" 22 | 23 | # Extract one thumbnail for every minute of video 24 | ffmpeg \ 25 | -i "video.mp4" \ 26 | -vf fps=1/60 \ 27 | "%d.jpg" 28 | 29 | # Push thumbnails to S3 30 | aws s3 \ 31 | cp . 
export default {
  indexName: 'algolia-education',
  playlists: [
    'PLuHdbqhRgWHIVm1e43_7mKUJw3UIreV84', // Algolia 101
    'PLuHdbqhRgWHJDATsVq_Mrj3NjIRjREiM1', // Discover Algolia
  ],
  /**
   * Cleans the title, extracts the target language and removes the speakers
   * @param {Object} rawRecord Record as built from the YouTube data
   * @param {Object} helper Config helper (provides trimKey)
   * @returns {Object} The updated record
   **/
  transformData(rawRecord, helper) {
    let record = rawRecord;

    // Trim the "Algolia Build 101" from the start
    record = helper.trimKey(record, 'video.title', 'Algolia Build 101 - ');

    // Extract the language if one is defined.
    // Records without a title are left without a language instead of
    // throwing on `undefined.match`.
    const videoTitle = _.get(record, 'video.title');
    const matches = _.isString(videoTitle)
      ? videoTitle.match(/for (.*) developers/)
      : null;
    if (!_.isEmpty(matches)) {
      const [, language] = matches;
      _.set(record, 'language', language);
    }

    // Remove the speakers
    _.set(record, 'speakers', []);

    return record;
  },
  /**
   * Adds faceting on the language to the index settings
   * @param {Object} rawSettings Default index settings
   * @returns {Object} New settings object; the input is left untouched
   **/
  transformSettings(rawSettings) {
    // Build a new list rather than pushing into the one we were given: the
    // defaults may be shared, and pushing would pile up duplicate entries on
    // repeated calls. _.union also tolerates a missing list.
    const attributesForFaceting = _.union(
      _.get(rawSettings, 'attributesForFaceting'),
      ['language']
    );

    return _.assign({}, rawSettings, { attributesForFaceting });
  },
};
// Typo tolerance is manually disabled on years: every year from FIRST_YEAR,
// for YEAR_COUNT consecutive years, as strings
const FIRST_YEAR = 1970;
const YEAR_COUNT = 60;
const yearsTypoDisabled = _.times(
  YEAR_COUNT,
  offset => `${FIRST_YEAR + offset}`
);

// Attributes searched in
const searchableAttributes = [
  'unordered(video.title)',
  'unordered(speakers.name)',
  'unordered(caption.content)',
  'unordered(conference.name)',
];

// Ranking criteria, applied in this order
const customRanking = [
  'desc(video.hasCaptions)',
  'desc(video.popularity.score)',
  'desc(video.hasManualCaptions)',
  'desc(video.publishedDate.day)',
  'desc(video.duration.minutes)',
  'asc(video.positionInPlaylist)',
  'asc(caption.start)',
];

// Attributes available as facets
const attributesForFaceting = [
  'speakers.name',
  'conference.name',
  'conference.year',
  'video.hasManualCaptions',
  'video.id',
  'video.languageCode',
  'caption.languageCode',
  'playlist.id',
  'playlist.title',
  'channel.id',
  'channel.title',
];

// Default index settings
const module = {
  searchableAttributes,
  customRanking,
  attributesForFaceting,
  attributesToSnippet: ['caption.content:8'],
  distinct: true,
  attributeForDistinct: 'video.id',
  highlightPreTag: '',
  highlightPostTag: '',
  advancedSyntax: true,
  disableTypoToleranceOnWords: yearsTypoDisabled,
};

export default module;
export default {
  indexName: 'voice_summit',
  playlists: [
    'PLn51IO3rbkV1E1a6WjgvFtW3VaOCRxzov', // VOICE Summit 2018: Speakers
  ],
  /**
   * Extracts a clean title and the list of speakers from the video title
   * @param {Object} rawRecord Record as built from the YouTube data
   * @param {Object} helper Config helper (provides trimKey and split)
   * @returns {Object} The updated record
   **/
  transformData(rawRecord, helper) {
    let record = rawRecord;

    // "john DOE" => "John Doe"
    const capitalizeName = speakerName =>
      _.map(_.words(speakerName), _.capitalize).join(' ');

    // Convert raw names to speaker objects, dropping the ones that end up
    // empty (e.g. when the title has no speaker segment at all)
    const toSpeakers = rawNames =>
      _.reject(
        _.map(rawNames, rawName => ({ name: capitalizeName(rawName) })),
        speaker => _.isEmpty(speaker.name)
      );

    // Safe access: a record without video/title is simply not a panel
    const isPanel = _.startsWith(_.get(record, 'video.title'), 'panel');

    // Cleaning title (keeping the returned record, as other configs do)
    record = helper.trimKey(
      record,
      'video.title',
      'keynote',
      'panel',
      'enterprise-'
    );
    let originalTitle = _.get(record, 'video.title');
    originalTitle = _.trimEnd(originalTitle, '-');

    const split = helper.split(originalTitle, '- ');
    let videoTitle;
    let speakers;

    if (isPanel) {
      // Panels: "{title} - {speaker}, {speaker}"
      videoTitle = _.capitalize(split[0]);
      speakers = toSpeakers(_.split(split[1], ','));
    } else {
      // Talks: "{speaker} and {speaker} - {title} - {more title}"
      videoTitle = _.map(_.slice(split, 1), _.capitalize).join(' - ');
      // NOTE(review): depending on how helper.split treats the 'and'
      // separator, names containing "and" could be cut — confirm.
      speakers = toSpeakers(helper.split(split[0], 'and', ','));
    }

    _.set(record, 'video.title', videoTitle);
    _.set(record, 'speakers', speakers);

    return record;
  },
};
export default {
  indexName: 'laracon',
  playlists: [
    'PLMdXHJK-lGoB-CIVsiQt0WU8WcYrb5eoe', // Laracon EU 2013 - Full Playlist
    'PLMdXHJK-lGoCYhxlU3OJ5bOGhcKtDMkcN', // Laracon EU 2014 - Full Playlist
    'PLMdXHJK-lGoA9SIsuFy0UWL8PZD1G3YFZ', // Laracon EU 2015 - Full Playlist
    'PLMdXHJK-lGoCMkOxqe82hOC8tgthqhHCN', // Laracon EU 2016 - Full Playlist
    'PLMdXHJK-lGoBFZgG2juDXF6LiikpQeLx2', // Laracon EU 2017 - Full Playlist
  ],
  /**
   * Reads the edition from the playlist title, then parses the video title
   * with the naming scheme used for that edition
   * @param {Object} rawRecord Record as built from the YouTube data
   * @param {Object} helper Config helper (provides enrich)
   * @returns {Object} The updated record
   **/
  transformData(rawRecord, helper) {
    // Video title scheme per edition, checked in this order
    const titlePatterns = [
      [['EU 2013'], '{_speakers_} - {video.title}'],
      [['EU 2014'], '{_speakers_} - {video.title} at Laracon EU 2014'],
      [['EU 2015'], '{video.title} - {_speakers_} - {_}'],
      [['EU 2016', 'EU 2017'], '{_speakers_} - {video.title} - {_}'],
    ];

    // Get the place and year for the playlist
    let record = helper.enrich(
      rawRecord,
      'playlist.title',
      'Laracon {conference.year} - Full Playlist'
    );

    _.each(titlePatterns, ([editions, pattern]) => {
      // The edition is re-read on each pass, from the current record
      const edition = _.get(record, 'conference.year');
      if (_.includes(editions, edition)) {
        record = helper.enrich(record, 'video.title', pattern);
      }
    });

    return record;
  },
};
it('should parse the JSON content as an object', async () => { 61 | mock('read', '{"foo": "bar"}'); 62 | 63 | const actual = await module.readJson(); 64 | 65 | expect(actual).toHaveProperty('foo', 'bar'); 66 | }); 67 | }); 68 | }); 69 | -------------------------------------------------------------------------------- /configs/writethedocs.js: -------------------------------------------------------------------------------- 1 | import _ from 'lodash'; 2 | 3 | export default { 4 | indexName: 'writethedocs', 5 | playlists: [ 6 | 'PLZAeFn6dfHpnHBLE4qEUwg1LjhDZEvC2A', // Write the Docs EU 2014 7 | 'PLZAeFn6dfHplFNTsVdBuHk6vPZbsvHtDw', // Write the Docs Europe 2015 8 | 'PLZAeFn6dfHpnN8fXXHwPtPY33aLGGhYLJ', // Write the Docs Europe 2016 9 | 'PLZAeFn6dfHpkBld-70TsOoYToM3CaTxRC', // Write the Docs Portland 2017 10 | 'PLZAeFn6dfHplBYPCwJt6ItkMDt7JSgUiL', // Write the Docs Prague 2017 11 | 'PLZAeFn6dfHplUgfLOLEuHHAm1HdrIyaZ7', // Write the Docs Portland 2018 12 | ], 13 | transformData(rawRecord, helper) { 14 | let record = rawRecord; 15 | const videoTitle = _.get(record, 'video.title'); 16 | 17 | // Get the place and year for the year 18 | record = helper.enrich( 19 | record, 20 | 'playlist.title', 21 | 'Write the Docs {conference.year}' 22 | ); 23 | 24 | // Keep lightning talks 25 | if (videoTitle && videoTitle.match(/lightning talks/i)) { 26 | return record; 27 | } 28 | 29 | // Portland 2018 30 | if (_.get(record, 'conference.year') === 'Portland 2018') { 31 | record = helper.enrich( 32 | record, 33 | 'video.title', 34 | '{video.title} - {_speakers_} - Write the Docs Portland 2018' 35 | ); 36 | return record; 37 | } 38 | 39 | // Prague 2017 40 | if (_.get(record, 'conference.year') === 'Prague 2017') { 41 | record = helper.enrich( 42 | record, 43 | 'video.title', 44 | 'Write the Docs Prague 2017: {video.title} by {_speakers_}' 45 | ); 46 | return record; 47 | } 48 | 49 | // Portland 2017 50 | if (_.get(record, 'conference.year') === 'Portland 2017') { 51 | record = 
// Finds the "pt. N" marker of multi-part episodes (e.g. "33, pt. 2") in the
// raw episode number and captures the part. Built once instead of per record.
const EPISODE_PART_PATTERN = /[0-9]* pt. (.*)/;

const config = {
  indexName: 'criticalrole',
  playlists: [
    'PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_', // Campaign 1: Vox Machina
    'PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2', // Campaign 2: The Mighty Nein
  ],

  /**
   * Override the index settings: captioned videos first, then campaign,
   * episode and caption start, all ascending.
   *
   * @param {Object} rawSettings Default index settings
   * @returns {Object} New settings object; `rawSettings` is not mutated
   **/
  transformSettings(rawSettings) {
    return {
      ...rawSettings,
      customRanking: [
        'desc(video.hasCaptions)',
        'asc(video.campaignNumber)',
        'asc(video.episodeNumber)',
        'asc(caption.start)',
      ],
      attributesForFaceting: [
        'video.id',
        'caption.languageCode',
        'caption.playerName',
        'playlist.id',
        'playlist.title',
        'channel.id',
        'channel.title',
      ],
    };
  },

  /**
   * Extract the campaign number, the episode number and the real episode
   * title from the video title.
   *
   * Campaign 2 titles look like
   *   "The Stalking Nightmare | Critical Role | Campaign 2, Episode 29"
   * Campaign 1 titles look like
   *   "Yug'voril Uncovered - Critical Role RPG Show: Episode 9"
   *   "Reunions | Critical Role RPG Show Episode 33, pt. 2"
   *
   * Records that cannot be parsed are returned instead of throwing: a record
   * without a video title is returned untouched, and a Campaign 1 record whose
   * title holds no episode number only gets its `video.campaignNumber` set.
   *
   * @param {Object} rawRecord Record as built from the YouTube data
   * @param {Object} helper Config helper (`enrich` and `split` are used)
   * @returns {Object} The transformed record
   **/
  transformData(rawRecord, helper) {
    let record = rawRecord;

    // _.get instead of record.video.title: records may come without a video
    const initialTitle = _.get(record, 'video.title');
    if (!initialTitle) {
      return record;
    }

    // Campaign 2
    if (_.includes(initialTitle, 'Campaign 2')) {
      record = helper.enrich(
        record,
        'video.title',
        '{video.title} | Critical Role | Campaign 2, Episode {video.episodeNumber}'
      );
      _.set(record, 'video.campaignNumber', 2);
      _.set(
        record,
        'video.episodeNumber',
        _.parseInt(_.get(record, 'video.episodeNumber'))
      );
      return record;
    }

    // Campaign 1
    _.set(record, 'video.campaignNumber', 1);
    record = helper.enrich(
      record,
      'video.title',
      '{_} Episode {video.episodeNumber}'
    );
    const episodeNumber = _.get(record, 'video.episodeNumber');

    // Nothing was extracted (title without "Episode N"): keep the title as is
    // rather than crashing on episodeNumber.match() below.
    if (typeof episodeNumber !== 'string') {
      return record;
    }

    // The episode title is whatever comes before the first "-" or "|"
    const parts = helper.split(initialTitle, '-', '|');
    let videoTitle = parts[0];

    // Multi-part episodes keep their part number in the title
    const severalPartsMatch = episodeNumber.match(EPISODE_PART_PATTERN);
    if (severalPartsMatch) {
      videoTitle = `${videoTitle}, part ${severalPartsMatch[1]}`;
    }

    _.set(record, 'video.episodeNumber', _.parseInt(episodeNumber));
    _.set(record, 'video.title', videoTitle);

    return record;
  },
};

export default config;
34 | 35 | * `ALGOLIA_APP_ID` and `ALGOLIA_API_KEY` for pushing records to Algolia 36 | * `YOUTUBE_API_KEY` to connect to the YouTube API 37 | * `GOOGLE_APPLICATION_CREDENTIALS` that points to the path to your 38 | `google.service-account-file.json` ([create one here][2]) 39 | 40 | We suggest using a tool like [direnv][1] to load those variables through the use 41 | of a `.envrc` file. 42 | 43 | Once everything is installed, you can run `yarn index {config_name}` 44 | 45 | ## Debug calls 46 | 47 | ### `yarn run index:cache` 48 | 49 | This will read data from a disk cache of previous requests instead of making 50 | actual HTTP calls. If there is no cache hit for the request, it will do it for 51 | real. 52 | 53 | This should be the preferred way of running the command for debugging purposes. 54 | 55 | ### `yarn run index:logs` 56 | 57 | This will log the raw responses of all HTTP calls to disk. This is useful when 58 | debugging, as it allows you to dig into the responses of the APIs called. 59 | 60 | [1]: https://direnv.net/ 61 | [2]: https://console.cloud.google.com/apis/credentials/serviceaccountkey 62 | -------------------------------------------------------------------------------- /configs/dotconferences.js: -------------------------------------------------------------------------------- 1 | export default { 2 | indexName: 'dotconferences', 3 | playlists: [ 4 | 'PLMW8Xq7bXrG7LL-bLSweRFmFw7y2HhypC', // dotJS 2018 5 | 'PLMW8Xq7bXrG702XVNfv_zqfFdt-498iV_', // dotCSS 2018 6 | 'PLMW8Xq7bXrG4zEMLdfZTpS9VCKjXeD--h', // dotScale 2018 7 | 'PLMW8Xq7bXrG6M2Nabwt3LuBxZyHVHRZhf', // dotAI 2018 8 | 'PLMW8Xq7bXrG4OC1CZW7m-davg4p4ZCBmZ', // dotSwift 2018 9 | 10 | 'PLMW8Xq7bXrG4gs_BDyI7q009IVDUMQRXB', // dotJS 2017 11 | 'PLMW8Xq7bXrG7acNjsU5YMGl5MMK5gl2vn', // dotGo 2017 12 | 'PLMW8Xq7bXrG7xzLo4j6bDznWzH7ZDc3wx', // dotSecurity 2017 13 | 'PLMW8Xq7bXrG4AcSG9ZcqvMQSp6f0C7mi5', // dotSwift 2017 14 | 'PLMW8Xq7bXrG78Xxnlxov8N_M9mNUN-1Ny', // dotCSS 2017 15 | 'PLMW8Xq7bXrG7fNNYHvpeagKHw4DaUkgud', 
// dotScale 2017 16 | 'PLMW8Xq7bXrG6-vlD0QFfFf0oi5vtTDcmQ', // dotAI 2017 17 | 18 | 'PLMW8Xq7bXrG6tcAXDsAVATUbrflLOsIG_', // dotGo 2016 19 | 'PLMW8Xq7bXrG7AAvnkys8joKEq8uMGykx7', // dotScale 2016 20 | 'PLMW8Xq7bXrG7XSuKb3M3bSJ4d1XM0Z-gI', // dotCSS 2016 21 | 'PLMW8Xq7bXrG7rZnRaYCel_RJY5yAXLQ2H', // dotJS 2016 22 | 'PLMW8Xq7bXrG4jymjKULrw5_yEvK3uzATe', // dotSecurity 2016 23 | 24 | 'PLMW8Xq7bXrG70G62mxQR0OC4GkUcNLRnC', // dotJS 2015 25 | 'PLMW8Xq7bXrG5kujoYQdw94ip3cnV4WR59', // dotCSS 2015 26 | 'PLMW8Xq7bXrG4Vw-JAnBmqA2IqzM2sf2Na', // dotGo 2015 27 | 'PLMW8Xq7bXrG64KRc6PC0JLWFX2ygzFJDG', // dotScale 2015 28 | 29 | 'PLMW8Xq7bXrG5B_oW-EX8AuLDG0BCwouis', // dotCSS 2014 30 | 'PLMW8Xq7bXrG4bTkovexbhgrcD8BVyHmiS', // dotJS 2014 31 | 'PLMW8Xq7bXrG58Qk-9QSy2HRh2WVeIrs7e', // dotGo 2014 32 | 'PLMW8Xq7bXrG4pl13YVsKkaAUDeLdnrEQZ', // dotScale 2014 33 | 34 | 'PLMW8Xq7bXrG6ZItH9Oq2tceeTS0fjXyii', // dotRB 2013 35 | 'PLMW8Xq7bXrG77SV1VAAiAciRyq3VSC2Gq', // dotJS 2012 36 | 'PLMW8Xq7bXrG486Mh95hKjiXRdci60zUlL', // dotJS 2013 37 | 'PLMW8Xq7bXrG7XGG29sXso2hYYNW_14s_A', // dotScale 2013 38 | ], 39 | transformData(rawRecord, helper) { 40 | let record = rawRecord; 41 | 42 | // Get conference name from the playlist title 43 | record = helper.enrich(record, 'playlist.title', '{conference.name} {_}'); 44 | 45 | // Extract speaker name and video title from title 46 | record = helper.enrich( 47 | record, 48 | 'video.title', 49 | '{_} - {_speakers_} - {video.title}' 50 | ); 51 | 52 | return record; 53 | }, 54 | }; 55 | -------------------------------------------------------------------------------- /configs/__tests__/chatbot_summit.js: -------------------------------------------------------------------------------- 1 | import config from '../chatbot_summit.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('Chatbot Summit', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 
// Promise flavours of the callback-based writers, created once at load time
const writeFile = pify(fs.writeFile);
const mkdirp = pify(mkdirpCallback);

const module = {
  /**
   * Promise version of glob().
   * The promisified function is created on first use and kept on `_glob`.
   * @param {String} pattern Glob pattern to match
   * @returns {Promise} Resolves to whatever the promisified glob() yields
   * (the list of matching files)
   **/
  async glob(pattern) {
    this._glob = this._glob || pify(glob);
    const matches = await this._glob(pattern);
    return matches;
  },

  /**
   * Read any file on disk.
   * The promisified fs.readFile is created on first use and kept on
   * `_readFile`. No encoding is passed to it, so the content comes back the
   * way fs.readFile returns it without one.
   * @param {String} filepath Path of the file to read
   * @returns {Promise} Resolves to the content of the file
   **/
  async read(filepath) {
    this._readFile = this._readFile || pify(fs.readFile);
    const content = await this._readFile(filepath);
    return content;
  },

  /**
   * Read a JSON file on disk and return its parsed content.
   * @param {String} source Path to the JSON file
   * @returns {Promise} Resolves to the parsed content, or to null when the
   * file cannot be read or does not hold valid JSON
   **/
  async readJson(source) {
    let parsed;
    try {
      parsed = JSON.parse(await this.read(source));
    } catch (err) {
      // Missing file and invalid JSON are both reported as "no data"
      parsed = null;
    }
    return parsed;
  },

  /**
   * Write some content to disk, creating the parent directories if needed.
   * @param {String} destination Destination filepath
   * @param {String} content Content to write to the file
   * @returns {Promise} Resolves to undefined once the file is written
   **/
  async write(destination, content) {
    const directory = path.dirname(destination);
    await mkdirp(directory);
    await writeFile(destination, content);
  },

  /**
   * Serialize an object to JSON (2-space indentation, through the
   * json-stable-stringify import) and write it to disk.
   * @param {String} destination Filepath to write the file to
   * @param {Object} data Object to convert to JSON and write to disk
   * @returns {Promise} Resolves to undefined once the file is written
   **/
  async writeJson(destination, data) {
    await this.write(destination, stringify(data, { space: 2 }));
  },
};

// Bind every method to the module so they keep working when passed around
// detached from it (they rely on `this` for the cached helpers).
Object.keys(module)
  .filter(name => typeof module[name] === 'function')
  .forEach(name => {
    module[name] = module[name].bind(module);
  });

export default module;
expect(mockMethod.mock.calls[2]).toEqual([3, 4, 2]); 57 | expect(mockMethod.mock.calls[3]).toEqual([4, undefined, 3]); 58 | }); 59 | }); 60 | 61 | describe('mapPairSlide', () => { 62 | it('loop through pair of items', () => { 63 | const input = [1, 2, 3, 4]; 64 | const method = (a, b, index) => _.sum([a, b, index]); 65 | 66 | const actual = module.mapPairSlide(input, method); 67 | 68 | expect(actual).toEqual([3, 6, 9, 7]); 69 | }); 70 | 71 | it('calls it once if array is empty', () => { 72 | const input = []; 73 | const method = jest.fn(); 74 | 75 | module.mapPairSlide(input, method); 76 | 77 | expect(method).toHaveBeenCalledTimes(1); 78 | }); 79 | }); 80 | }); 81 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "talksearch-scraper", 3 | "version": "1.0.0", 4 | "description": "Commandline tool to index YouTube videos into Algolia", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "./scripts/test", 8 | "test:watch": "./scripts/test-watch", 9 | "lint": "./scripts/lint", 10 | "index": "babel-node ./scripts/index.js", 11 | "index:cache": "READ_FROM_CACHE=1 babel-node ./scripts/index.js", 12 | "index:logs": "WRITE_RESPONSE_LOGS=1 babel-node ./scripts/index.js", 13 | "new:config": "babel-node ./scripts/new-config.js", 14 | "thumbnails": "./scripts/thumbnails i-p9lWIhcLQ", 15 | "precommit": "yarn lint", 16 | "prepush": "yarn test" 17 | }, 18 | "repository": { 19 | "type": "git", 20 | "url": "git+https://github.com/algolia/talksearch-scraper.git" 21 | }, 22 | "keywords": [], 23 | "author": "", 24 | "license": "ISC", 25 | "bugs": { 26 | "url": "https://github.com/algolia/talksearch-scraper/issues" 27 | }, 28 | "homepage": "https://github.com/algolia/talksearch-scraper#readme", 29 | "dependencies": { 30 | "algoliasearch": "3.32.0", 31 | "axios": "0.17.1", 32 | "bluebird": "3.5.3", 33 | "chalk": "2.4.2", 34 | "cheerio": 
"1.0.0-rc.2", 35 | "dayjs": "1.8.0", 36 | "dotenv": "4.0.0", 37 | "glob": "7.1.3", 38 | "json-stable-stringify": "1.0.1", 39 | "lodash": "4.17.11", 40 | "mkdirp": "0.5.1", 41 | "multi-progress": "2.0.0", 42 | "p-all": "1.0.0", 43 | "p-each-series": "1.0.0", 44 | "p-iteration": "1.1.7", 45 | "p-map": "1.2.0", 46 | "parse-iso-duration": "1.0.0", 47 | "pify": "3.0.0", 48 | "query-string": "6.2.0", 49 | "yargs": "11.1.0" 50 | }, 51 | "devDependencies": { 52 | "@babel/cli": "7.2.3", 53 | "@babel/core": "7.2.2", 54 | "@babel/node": "7.2.2", 55 | "@babel/plugin-proposal-object-rest-spread": "7.2.0", 56 | "@babel/preset-env": "7.2.3", 57 | "@google-cloud/language": "1.2.0", 58 | "algolia-indexing": "0.0.6", 59 | "babel-core": "7.0.0-bridge.0", 60 | "babel-eslint": "8.2.6", 61 | "babel-jest": "23.6.0", 62 | "babel-plugin-dynamic-import-node-sync": "2.0.1", 63 | "eslint": "4.19.1", 64 | "eslint-config-algolia": "13.2.3", 65 | "eslint-config-prettier": "2.10.0", 66 | "eslint-plugin-import": "2.14.0", 67 | "eslint-plugin-jest": "21.27.2", 68 | "eslint-plugin-prettier": "2.7.0", 69 | "husky": "0.14.3", 70 | "inquirer": "6.2.1", 71 | "jest": "23.6.0", 72 | "prettier": "1.15.3", 73 | "slugify": "1.3.4", 74 | "uuid": "3.3.2" 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /configs/__tests__/voice_summit.js: -------------------------------------------------------------------------------- 1 | import config from '../voice_summit.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('voice_summit', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 10 | 11 | it('getting speaker and title', () => { 12 | const input = { 13 | video: { 14 | title: 15 | 'cathy pearl - i can do that, dave- voice assistants have arrived-', 16 | }, 17 | }; 18 | 19 | const actual = current(input); 20 | 21 | expect(actual).toHaveProperty( 22 | 
/**
 * Walk `collection` by steps of `step` and call `method` with the current
 * element, the one right after it and the number of the call (starting at 0).
 * null, undefined and anything without a length are treated as empty.
 *
 * @param {Array} collection The list of elements (array or array-like)
 * @param {Number} step How many elements to advance between two calls
 * @param {Function} method The method to call
 * @returns {Array} What each call to method returned
 **/
function collectPairs(collection, step, method) {
  const size = collection ? collection.length || 0 : 0;
  const results = [];
  for (let i = 0; i < size; i += step) {
    results.push(method(collection[i], collection[i + 1], i / step));
  }
  return results;
}

const module = {
  /**
   * Call method on each pair of elements in the collection. The last argument
   * is the index of the pair.
   * Example:
   * eachPair([1, 2, 3], fn) will call fn(1, 2, 0) and fn(3, undefined, 1);
   *
   * @param {Array} collection The list of elements
   * @param {Function} method The method to call on each pair
   * @returns {void}
   **/
  eachPair(collection, method) {
    collectPairs(collection, 2, method);
  },

  /**
   * Map method on each pair of elements in the collection. The last argument
   * is the index of the pair.
   * Example:
   * mapPair([1, 2, 3], fn) will return [fn(1, 2, 0), fn(3, undefined, 1)];
   *
   * @param {Array} collection The list of elements
   * @param {Function} method The method to map on each pair
   * @returns {Array} An array of each pair of element passed through method
   **/
  mapPair(collection, method) {
    return collectPairs(collection, 2, method);
  },

  /**
   * Call method on each element and its successor, sliding through all the
   * elements one by one. The last argument is the index of the first element.
   * Example:
   * eachPairSlide([1, 2, 3], fn) will call fn(1, 2, 0), fn(2, 3, 1) and
   * fn(3, undefined, 2);
   *
   * @param {Array} collection The list of elements
   * @param {Function} method The method to call on each pair
   * @returns {void}
   **/
  eachPairSlide(collection, method) {
    collectPairs(collection, 1, method);
  },

  /**
   * Map method on each element and its successor, sliding through all the
   * elements one by one. The last argument is the index of the first element.
   * Example:
   * mapPairSlide([1, 2, 3], fn) will return [fn(1, 2, 0), fn(2, 3, 1),
   * fn(3, undefined, 2)];
   *
   * Note: on an empty collection, method is still called once, without any
   * argument, and its result is returned in a one-element array.
   *
   * @param {Array} collection The list of elements
   * @param {Function} method The method to call on each pair
   * @returns {Array} An array of each pair of element passed through method
   **/
  mapPairSlide(collection, method) {
    const results = collectPairs(collection, 1, method);
    // collectPairs only returns nothing when the collection is empty
    return results.length === 0 ? [method()] : results;
  },
};

// None of the methods relies on `this`, so they can be used detached from the
// module without being bound first.
export default module;
expect(actual).toHaveProperty('video.campaignNumber', 1); 40 | expect(actual).toHaveProperty('video.episodeNumber', 9); 41 | }); 42 | 43 | it('works even when there is no colon', () => { 44 | const input = { 45 | video: { 46 | title: 'The Path to Whitestone | Critical Role RPG Show Episode 27', 47 | }, 48 | }; 49 | 50 | const actual = current(input); 51 | 52 | expect(actual).toHaveProperty('video.title', 'The Path to Whitestone'); 53 | expect(actual).toHaveProperty('video.campaignNumber', 1); 54 | expect(actual).toHaveProperty('video.episodeNumber', 27); 55 | }); 56 | 57 | it('works with guest star', () => { 58 | const input = { 59 | video: { 60 | title: 61 | 'Cindergrove Revisited | Critical Role RPG Show Episode 46 w/ CHRIS HARDWICK', 62 | }, 63 | }; 64 | 65 | const actual = current(input); 66 | 67 | expect(actual).toHaveProperty('video.title', 'Cindergrove Revisited'); 68 | expect(actual).toHaveProperty('video.campaignNumber', 1); 69 | expect(actual).toHaveProperty('video.episodeNumber', 46); 70 | }); 71 | 72 | it('works with multiparts', () => { 73 | const input = { 74 | video: { 75 | title: 'Reunions | Critical Role RPG Show Episode 33, pt. 
2', 76 | }, 77 | }; 78 | 79 | const actual = current(input); 80 | 81 | expect(actual).toHaveProperty('video.title', 'Reunions, part 2'); 82 | expect(actual).toHaveProperty('video.campaignNumber', 1); 83 | expect(actual).toHaveProperty('video.episodeNumber', 33); 84 | }); 85 | }); 86 | }); 87 | }); 88 | -------------------------------------------------------------------------------- /configs/__tests__/laracon.js: -------------------------------------------------------------------------------- 1 | import config from '../laracon.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('laracon', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 10 | 11 | it('playlist name', () => { 12 | const input = { 13 | playlist: { 14 | title: 'Laracon EU 2014 - Full Playlist', 15 | }, 16 | }; 17 | 18 | const actual = current(input); 19 | 20 | expect(actual).toHaveProperty('conference.year', 'EU 2014'); 21 | }); 22 | 23 | describe('2013', () => { 24 | it('speaker and title', () => { 25 | const input = { 26 | playlist: { 27 | title: 'Laracon EU 2013 - Full Playlist', 28 | }, 29 | video: { 30 | title: 'Ben Corlett - Bridging the Gap', 31 | }, 32 | }; 33 | 34 | const actual = current(input); 35 | 36 | expect(actual).toHaveProperty('video.title', 'Bridging the Gap'); 37 | expect(actual).toHaveProperty('speakers', [{ name: 'Ben Corlett' }]); 38 | }); 39 | }); 40 | 41 | describe('2014', () => { 42 | it('speaker and title', () => { 43 | const input = { 44 | playlist: { 45 | title: 'Laracon EU 2014 - Full Playlist', 46 | }, 47 | video: { 48 | title: 49 | "Ross Tuck - Things I Believe Now That I'm Old at Laracon EU 2014", 50 | }, 51 | }; 52 | 53 | const actual = current(input); 54 | 55 | expect(actual).toHaveProperty( 56 | 'video.title', 57 | "Things I Believe Now That I'm Old" 58 | ); 59 | expect(actual).toHaveProperty('speakers', [{ name: 'Ross Tuck' }]); 60 | }); 61 | }); 
62 | 63 | describe('2015', () => { 64 | it('speaker and title', () => { 65 | const input = { 66 | playlist: { 67 | title: 'Laracon EU 2015 - Full Playlist', 68 | }, 69 | video: { 70 | title: 'The Tao of Laravel - Taylor Otwell - Laracon EU 2015', 71 | }, 72 | }; 73 | 74 | const actual = current(input); 75 | 76 | expect(actual).toHaveProperty('video.title', 'The Tao of Laravel'); 77 | expect(actual).toHaveProperty('speakers', [{ name: 'Taylor Otwell' }]); 78 | }); 79 | }); 80 | 81 | describe('2016-2017', () => { 82 | it('speaker and title', () => { 83 | const input = { 84 | playlist: { 85 | title: 'Laracon EU 2017 - Full Playlist', 86 | }, 87 | video: { 88 | title: 'Christopher Pitt - Transforming PHP - Laracon EU 2017', 89 | }, 90 | }; 91 | 92 | const actual = current(input); 93 | 94 | expect(actual).toHaveProperty('video.title', 'Transforming PHP'); 95 | expect(actual).toHaveProperty('speakers', [ 96 | { name: 'Christopher Pitt' }, 97 | ]); 98 | }); 99 | }); 100 | }); 101 | }); 102 | -------------------------------------------------------------------------------- /src/progress.js: -------------------------------------------------------------------------------- 1 | import chalk from 'chalk'; 2 | import _ from 'lodash'; 3 | import pulse from './pulse'; 4 | import globals from './globals'; 5 | import MultiProgressBar from 'multi-progress'; 6 | const progressBars = new MultiProgressBar(); 7 | const allBars = {}; 8 | let warnings = []; 9 | 10 | function newBar(id, color, max) { 11 | const name = chalk[color](id); 12 | const bar = progressBars.newBar(`[${name}] [:bar] :current/:total`, { 13 | width: 90, 14 | total: max, 15 | }); 16 | bar.tick(0); 17 | allBars[id] = bar; 18 | } 19 | 20 | function updateCursor() { 21 | if (process.stdout.cursorTo) { 22 | process.stdout.cursorTo(0, 10000); 23 | } 24 | } 25 | 26 | const youtube = { 27 | onCrawlingStart(data) { 28 | const name = globals.configName(); 29 | const total = data.playlists.length; 30 | newBar(name, 'blue', total); 
/**
 * Print every warning collected so far, grouped by title, then empty the
 * list. Each title is printed once in red, followed in yellow by the details
 * of all the warnings sharing that title, one per line.
 * @returns {Void}
 **/
function displayWarnings() {
  updateCursor();

  const warningsByTitle = _.groupBy(warnings, 'title');
  _.forEach(warningsByTitle, (group, title) => {
    // details can be a single value or a list: flatten one level before joining
    const detailLines = _.flatMap(group, 'details');
    const body = detailLines.join('\n');

    console.info(chalk.red(title));
    console.info(chalk.yellow(body));
  });

  // Start from a clean slate for the next run
  warnings = [];
}
pulse.on('playlist:start', youtube.onPlaylistStart); 109 | pulse.on('playlist:chunk', youtube.onPlaylistChunk); 110 | pulse.on('playlist:end', youtube.onPlaylistEnd); 111 | 112 | pulse.on('enrich:start', language.onEnrichStart); 113 | pulse.on('enrich:chunk', language.onEnrichChunk); 114 | pulse.on('enrich:end', language.onEnrichEnd); 115 | 116 | const progress = { 117 | displayWarnings, 118 | }; 119 | 120 | export default progress; 121 | -------------------------------------------------------------------------------- /configs/__tests__/writethedocs.js: -------------------------------------------------------------------------------- 1 | import config from '../writethedocs.js'; 2 | import helper from '../config-helper.js'; 3 | 4 | describe('writethedocs', () => { 5 | describe('transformData', () => { 6 | let current; 7 | beforeEach(() => { 8 | current = input => config.transformData(input, helper); 9 | }); 10 | 11 | it('Conference Year', () => { 12 | const input = { 13 | playlist: { 14 | title: 'Write the Docs Europe 2015', 15 | }, 16 | }; 17 | 18 | const actual = current(input); 19 | 20 | expect(actual).toHaveProperty('conference.year', 'Europe 2015'); 21 | }); 22 | 23 | describe('Portland 2018', () => { 24 | it('getting speaker and title', () => { 25 | const input = { 26 | playlist: { 27 | title: 'Write the Docs Portland 2018', 28 | }, 29 | video: { 30 | title: 31 | 'Building Empathy-Driven Developer Documentation - Kat King - Write the Docs Portland 2018', 32 | }, 33 | }; 34 | 35 | const actual = current(input); 36 | 37 | expect(actual).toHaveProperty( 38 | 'video.title', 39 | 'Building Empathy-Driven Developer Documentation' 40 | ); 41 | expect(actual).toHaveProperty('speakers', [{ name: 'Kat King' }]); 42 | }); 43 | }); 44 | describe('Portland 2017', () => { 45 | it('getting speaker and title', () => { 46 | const input = { 47 | playlist: { 48 | title: 'Write the Docs Portland 2017', 49 | }, 50 | video: { 51 | title: 52 | 'Write the Docs Portland 2017: Building 
// Prague 2017 titles follow "<conference>: <talk title> by <speaker>"
describe('Prague 2017', () => {
  it('getting speaker and title', () => {
    const input = {
      playlist: {
        title: 'Write the Docs Prague 2017',
      },
      video: {
        title:
          'Write the Docs Prague 2017: Telling a Great Story on GitHub by Lauri Apple',
      },
    };

    const actual = current(input);

    expect(actual).toHaveProperty(
      'video.title',
      'Telling a Great Story on GitHub'
    );
    expect(actual).toHaveProperty('speakers', [{ name: 'Lauri Apple' }]);
  });
});
// Interactive helpers used to scaffold a new config file and its search API key
const newConfig = {
  // Open a prompt and wait for the answer.
  // Resolves to whatever the user typed for the given question.
  async prompt(question) {
    const answer = await inquirer.prompt([{ name: 'key', message: question }]);
    return answer.key;
  },

  // Ask for the config name, asking again as long as a config file with the
  // same name already exists. Resolves to a lowercase, underscore-separated
  // name.
  async getConfigName() {
    const userInput = await this.prompt('What is the config name?');
    // _.toLower only changes the case. _.lowerCase would split the slug back
    // into space-separated words ("chatbot_summit" => "chatbot summit") and
    // put spaces in the file and index names.
    const configName = _.toLower(slugify(userInput, '_'));
    if (fs.existsSync(`./configs/${configName}.js`)) {
      console.info(
        chalk.red(`✘ There is already a config named ${configName}`)
      );
      return await this.getConfigName();
    }
    return configName;
  },

  // Ask for playlist ids, one per prompt, until an empty answer is given.
  // Answers can be raw playlist ids or urls containing a `list` parameter.
  // Resolves to the list of unique ids entered.
  async getPlaylistIds(memo = []) {
    let question;
    if (_.isEmpty(memo)) {
      question = 'Enter a playlist ID or playlist url:';
    } else {
      const addenda = chalk.grey('(leave empty if not)');
      question = `Other playlist ID or playlist url? ${addenda}`;
    }

    let playlistId = await this.prompt(question);
    // Only the part after the "?" is a query string. Parsing a whole url
    // would turn "https://…/playlist?list=ID" into the key
    // "https://…/playlist?list" and the `list` parameter would be missed
    // whenever it comes first. Inputs without "?" are parsed as-is.
    const queryStart = playlistId.indexOf('?');
    const potentialQuery =
      queryStart === -1 ? playlistId : playlistId.slice(queryStart + 1);
    const potentialQueryString = queryString.parse(potentialQuery);
    if (potentialQueryString.list) {
      playlistId = potentialQueryString.list;
    }

    if (!playlistId) {
      return _.uniq(memo);
    }
    memo.push(playlistId);
    return await this.getPlaylistIds(memo);
  },

  // Get the new config file content: the sample config with the index name
  // and the playlist ids (each followed by its title as a comment) filled in,
  // formatted with prettier.
  async getConfigFileContent(configName, playlistIds) {
    // Get name of playlists along with ids
    const playlists = [];
    await pMap(playlistIds, async playlistId => {
      const playlistData = await youtube.getPlaylistData(playlistId);
      playlists.push({ id: playlistId, name: playlistData.title });
    });
    const playlistReplace = _.map(
      _.sortBy(playlists, ['name']),
      playlist => `'${playlist.id}', // ${playlist.name}`
    ).join('\n');

    // Read the sample and update the content
    const sampleContent = fs.readFileSync(
      './configs/config.sample.js',
      'utf-8'
    );
    let newContent = _.replace(sampleContent, '{{indexName}}', configName);
    newContent = _.replace(
      newContent,
      "'{{playlistIds}}'",
      `\n${playlistReplace}\n`
    );
    newContent = prettier.format(newContent, {
      singleQuote: true,
      trailingComma: 'all',
    });

    return newContent;
  },

  // Create an Algolia API key with the `search` ACL, restricted to the index
  // named after the config. Resolves to the key string.
  async createApiKey(configName) {
    const client = algoliasearch(
      globals.algoliaAppId(),
      globals.algoliaApiKey()
    );
    const apiKey = await client.addApiKey(['search'], {
      indexes: [configName],
      description: configName,
    });

    return apiKey.key;
  },
};
| fs.writeFileSync(`./configs/${configName}.js`, configContent); 114 | console.info( 115 | `${chalk.green('✔')} Config for ${chalk.green(configName)} saved.` 116 | ); 117 | 118 | // Create the API key 119 | const apiKey = await newConfig.createApiKey(configName); 120 | console.info(`indexName: ${chalk.blue(configName)}`); 121 | console.info(`apiKey: ${chalk.green(apiKey)}`); 122 | } catch (err) { 123 | console.info(chalk.red('✘ ERROR:')); 124 | console.info(err.message); 125 | console.info(err); 126 | process.exit(1); // eslint-disable-line no-process-exit 127 | } 128 | })(); 129 | -------------------------------------------------------------------------------- /configs/config-helper.js: -------------------------------------------------------------------------------- 1 | import _ from 'lodash'; 2 | import dayjs from 'dayjs'; 3 | 4 | /** 5 | * Returns an object with (deep) keys matching the {named} patterns. 6 | * Example: 7 | * helper.match('John Doe - 2018', '{video.title} - {conference.year}') 8 | * => { video: { title: 'John Doe' }, conference: { year: '2018' } } 9 | * @param {String} input Initial string to parse 10 | * @param {String} pattern Pattern string 11 | * @returns {Object} Object containing matching keys 12 | * Note: The special {_} pattern can be used to discard a match from the result. 
/**
 * Extracts the parts of a string that line up with the {named} placeholders
 * of a pattern.
 * Example:
 *  helper.match('John Doe - 2018', '{video.title} - {conference.year}')
 *  => { 'video.title': 'John Doe', 'conference.year': '2018' }
 * @param {String} input Initial string to parse
 * @param {String} pattern Pattern string
 * @returns {Object|Boolean} Object with one key per placeholder name, or false
 * when the input is null/undefined or does not match the pattern
 * Note: Keys are the placeholder names as written (dots included); they are
 * not expanded into nested objects here.
 * Note: The special {_} pattern can be used to discard a match from the result.
 * Note: Only `|` is escaped in the pattern. Any other regular expression
 * metacharacter it contains is interpreted as regexp syntax.
 **/
function match(input, pattern) {
  // Nothing to extract from a missing input: answer like a failed match
  // instead of throwing
  if (input === null || input === undefined) {
    return false;
  }

  // Identifying each group of {named} patterns
  const placeholderRegexp = /{.*?}/g;

  // Names are mapped by position: the nth placeholder is the nth capture
  const namedGroups = (pattern.match(placeholderRegexp) || []).map(needle =>
    needle.replace(/{|}/g, '')
  );

  // Convert the simple pattern (using {}) to a real regexp, escaping pipes so
  // they are matched literally
  const stringRegexp = pattern
    .replace(placeholderRegexp, '(.*)')
    .replace(/\|/g, '\\|');

  const matches = input.match(new RegExp(stringRegexp));

  // No match found
  if (!matches) {
    return false;
  }

  // Linking each capture to its named value
  const result = {};
  namedGroups.forEach((namedGroup, index) => {
    result[namedGroup] = matches[index + 1];
  });
  // Discard the _ ignore pattern
  delete result._;

  return result;
}
/**
 * Splits a string according to multiple separators
 * Example:
 *  helper.split('foo / bar | baz', '/', '|');
 *  => ['foo', 'bar', 'baz']
 * @param {String} input Initial string to split
 * @param {String} ...separators List of separators to use
 * @returns {Array} Array of elements, empty if the input is null or undefined
 * Note that elements are trimmed and empty ones removed each time a separator
 * is applied. Without any separator the input is returned untouched, as the
 * only element.
 **/
function split(input, ...separators) {
  // Nothing to split: return an empty list instead of throwing
  if (input === null || input === undefined) {
    return [];
  }

  // Apply the separators one after the other, each one on the parts left by
  // the previous one
  return separators.reduce(
    (parts, separator) =>
      parts
        .map(part => part.split(separator))
        .reduce((flat, pieces) => flat.concat(pieces), [])
        .map(piece => piece.trim())
        .filter(Boolean),
    [input]
  );
}
12 | * @param {Object} videoData Object of all interactions 13 | * @return {Number} Popularity score 14 | **/ 15 | getPopularityScore(videoData) { 16 | if (!_.has(videoData, 'popularity')) { 17 | return 0; 18 | } 19 | return _.sum(_.values(_.get(videoData, 'popularity'))); 20 | }, 21 | 22 | /** 23 | * Return an object representation of the date, with timestamp values capped at 24 | * the start of the day, month and year. This will be used to limit ties in the 25 | * custom ranking 26 | * @param {Number} timestamp The exact timestamp 27 | * @returns {Object} An object of capped timestamps 28 | **/ 29 | getBucketedDate(timestamp) { 30 | const date = dayjs(timestamp * 1000); 31 | const yearGranularity = date.startOf('year'); 32 | const monthGranularity = date.startOf('month'); 33 | const dayGranularity = date.startOf('day'); 34 | 35 | return { 36 | year: yearGranularity.unix(), 37 | month: monthGranularity.unix(), 38 | day: dayGranularity.unix(), 39 | timestamp: date.unix(), 40 | }; 41 | }, 42 | 43 | /** 44 | * Return a url to go to the specific time in the video 45 | * @param {String} videoId Video id 46 | * @param {Number} start Start time, in seconds 47 | * @returns {String} Url pointing to the specific time in the video 48 | **/ 49 | getCaptionUrl(videoId, start) { 50 | let url = `https://www.youtube.com/watch?v=${videoId}`; 51 | if (start > 0) { 52 | url = `${url}&t=${start}s`; 53 | } 54 | return url; 55 | }, 56 | 57 | /** 58 | * Return an object representing a caption 59 | * @param {String} userCaption Caption string 60 | * @param {Number} position Position index in the list of all captions 61 | * @param {String} videoId The YouUbe videoId 62 | * @returns {Object} An object representing a caption 63 | **/ 64 | getCaptionDetails(userCaption, position, videoId) { 65 | let caption = userCaption; 66 | // Always adding a caption, even if empty, it makes the front-end logic easier 67 | // to handle 68 | if (!caption) { 69 | caption = { 70 | content: null, 71 | 
duration: 0, 72 | start: 0, 73 | }; 74 | } 75 | 76 | // Round start to exact second because we can't jump to more precise than 77 | // that 78 | const start = _.floor(caption.start); 79 | const url = this.getCaptionUrl(videoId, start); 80 | 81 | return { 82 | ...caption, 83 | position, 84 | start, 85 | url, 86 | }; 87 | }, 88 | 89 | /** 90 | * Returns an array of record from a video 91 | * @param {Object} video The video object 92 | * @returns {Array} An array of all records for this video, one per caption 93 | **/ 94 | recordsFromVideo(video) { 95 | // Enhanced video data 96 | const videoDetails = { ...video.video }; 97 | _.set( 98 | videoDetails, 99 | 'popularity.score', 100 | this.getPopularityScore(videoDetails) 101 | ); 102 | _.set( 103 | videoDetails, 104 | 'publishedDate', 105 | this.getBucketedDate(videoDetails.publishedDate) 106 | ); 107 | 108 | // Base record metadata to add to all records 109 | let baseRecord = { 110 | video: videoDetails, 111 | playlist: video.playlist, 112 | channel: video.channel, 113 | speakers: video.speakers, 114 | conference: video.conference, 115 | }; 116 | 117 | // Config specific updates 118 | const config = globals.config(); 119 | if (_.get(config, 'transformData')) { 120 | baseRecord = config.transformData(baseRecord, configHelper); 121 | } 122 | 123 | // One record per caption, with a minimum of 1 even if no captions 124 | let captions = _.get(video, 'captions'); 125 | if (_.isEmpty(captions)) { 126 | captions = [undefined]; 127 | } 128 | 129 | return _.map(captions, (caption, position) => { 130 | const videoId = baseRecord.video.id; 131 | const captionDetails = this.getCaptionDetails(caption, position, videoId); 132 | const record = { 133 | ...baseRecord, 134 | caption: captionDetails, 135 | }; 136 | 137 | return record; 138 | }); 139 | }, 140 | 141 | /** 142 | * Guess the conference year based on the playlist name 143 | * @param {Object} video The video object 144 | * @returns {Number} The conference year 145 | **/ 146 | 
guessConferenceYear(video) { 147 | const playlistTitle = _.get(video, 'playlist.title', null); 148 | if (!playlistTitle) { 149 | return null; 150 | } 151 | const matches = playlistTitle.match(/[0-9]{4}/); 152 | if (!matches) { 153 | return null; 154 | } 155 | return _.parseInt(matches); 156 | }, 157 | 158 | /** 159 | * Enrich the raw video data as extracted from YouTube with some guess about 160 | * other fields 161 | * @param {Array} inputVideos List of raw videos 162 | * @returns {Array} The enriched list of videos 163 | * Note that this will create .conference and .speakers keys to the object 164 | **/ 165 | async enrichVideos(inputVideos) { 166 | // Extract speakers from text analysis of the title 167 | let videos = await language.enrichVideos(inputVideos); 168 | 169 | // Guessing conference year 170 | videos = _.map(videos, video => ({ 171 | ...video, 172 | conference: { 173 | year: this.guessConferenceYear(video), 174 | }, 175 | })); 176 | 177 | return videos; 178 | }, 179 | 180 | async run(inputVideos) { 181 | // Enrich videos 182 | const videos = await this.enrichVideos(inputVideos); 183 | 184 | // Convert videos to records 185 | const records = _.flatten(_.map(videos, this.recordsFromVideo)); 186 | 187 | return records; 188 | }, 189 | }; 190 | 191 | export default _.bindAll(module, _.functions(module)); 192 | -------------------------------------------------------------------------------- /configs/__tests__/config-helper.js: -------------------------------------------------------------------------------- 1 | import module from '../config-helper'; 2 | 3 | let current; 4 | 5 | describe('configHelper', () => { 6 | describe('match', () => { 7 | beforeEach(() => { 8 | current = module.match; 9 | }); 10 | 11 | it('extracts one simple pattern', () => { 12 | const input = 'Foo - Bar'; 13 | const pattern = '{name} - Bar'; 14 | 15 | const actual = current(input, pattern); 16 | 17 | expect(actual).toHaveProperty('name', 'Foo'); 18 | }); 19 | 20 | it('extracts several 
// The {_} placeholder has to match, but is left out of the result
it('allows the _ ignore wildcard', () => {
  const input = 'Foo - Bar';
  const pattern = '{_} - {name}';

  const actual = current(input, pattern);

  expect(actual).toHaveProperty('name', 'Bar');
  expect(actual).not.toHaveProperty('_');
});
// A {_speakers_} value naming several people yields one speaker per person,
// whether the names are separated by `&` or by `,`
describe('several speakers', () => {
  it('separated by &', () => {
    const record = {
      video: {
        title: 'Tim Carry & Lucas Bonomi - bar',
      },
    };
    const path = 'video.title';
    const pattern = '{_speakers_} - {video.title}';

    const actual = current(record, path, pattern);

    expect(actual).toHaveProperty('speakers', [
      { name: 'Tim Carry' },
      { name: 'Lucas Bonomi' },
    ]);
  });

  it('separated by ,', () => {
    const record = {
      video: {
        title: 'Tim Carry, Lucas Bonomi - bar',
      },
    };
    const path = 'video.title';
    const pattern = '{_speakers_} - {video.title}';

    const actual = current(record, path, pattern);

    expect(actual).toHaveProperty('speakers', [
      { name: 'Tim Carry' },
      { name: 'Lucas Bonomi' },
    ]);
  });
});
module.trimKey; 236 | }); 237 | 238 | it('removes one passed trim element from the key', () => { 239 | const input = { 240 | video: { 241 | title: 'foo Conference', 242 | }, 243 | }; 244 | 245 | const actual = current(input, 'video.title', 'Conference'); 246 | 247 | expect(actual).toHaveProperty('video.title', 'foo'); 248 | }); 249 | 250 | it('removes all passed trim element from the key', () => { 251 | const input = { 252 | video: { 253 | title: 'foo Conference Bar', 254 | }, 255 | }; 256 | 257 | const actual = current(input, 'video.title', 'Conference', 'Bar'); 258 | 259 | expect(actual).toHaveProperty('video.title', 'foo'); 260 | }); 261 | }); 262 | 263 | describe('year', () => { 264 | it('should return the year of a given date', () => { 265 | const input = 1490979292; 266 | 267 | const actual = module.year(input); 268 | 269 | expect(actual).toEqual(2017); 270 | }); 271 | }); 272 | }); 273 | -------------------------------------------------------------------------------- /src/language.js: -------------------------------------------------------------------------------- 1 | import pulse from './pulse'; 2 | import globals from './globals'; 3 | import fileutils from './fileutils'; 4 | import diskLogger from './disk-logger'; 5 | import language from '@google-cloud/language'; 6 | import pMap from 'p-map'; 7 | import uuid from 'uuid/v1'; 8 | import _ from 'lodash'; 9 | let CLIENT; 10 | 11 | const cache = { 12 | STORE: {}, 13 | /** 14 | * Return the internal STORE 15 | * @return {Object} Value of the current cache 16 | * Each key is a videoID, and each value is an array of objects containing an 17 | * .input and .entities keys. 
18 | * Note: This is used for tests, to be able to manipulate the private data 19 | **/ 20 | get() { 21 | return this.STORE; 22 | }, 23 | 24 | /** 25 | * Set the internal STORE 26 | * @param {Object} newCache The new value of the cache 27 | * @returns {Void} 28 | * Note: This is used for tests, to be able to manipulate the private data 29 | **/ 30 | set(newCache) { 31 | this.STORE = newCache; 32 | }, 33 | 34 | /** 35 | * Returns the filepath to save the cache to. 36 | * @return {String} Filepath, relative to the config 37 | **/ 38 | filepath() { 39 | return `./cache/${globals.configName()}/language/cache.json`; 40 | }, 41 | 42 | /** 43 | * Read cache file from disk and load it into memory 44 | * @return {Void} 45 | **/ 46 | async grab() { 47 | const cacheFile = this.filepath(); 48 | const cacheContent = await fileutils.readJson(cacheFile); 49 | this.set(cacheContent || {}); 50 | }, 51 | 52 | /** 53 | * Write the local cache object to disk 54 | * @return {Void} 55 | **/ 56 | async release() { 57 | const cacheFile = this.filepath(); 58 | await fileutils.writeJson(cacheFile, this.get()); 59 | }, 60 | 61 | /** 62 | * Read from local cache if we already have entities for this videoId and input 63 | * @param {String} videoId The YouTube videoId 64 | * @param {String} input The input string to analyze 65 | * @return {Array|Boolean} false if no result, the array of entities otherwise 66 | **/ 67 | read(videoId, input) { 68 | const shouldUseCache = globals.readFromCache(); 69 | if (!shouldUseCache) { 70 | return false; 71 | } 72 | 73 | const cacheVideoEntries = _.get(this.get(), videoId); 74 | if (!cacheVideoEntries) { 75 | return false; 76 | } 77 | 78 | return _.get(_.find(cacheVideoEntries, { input }), 'entities', false); 79 | }, 80 | 81 | /** 82 | * Write the entities to the matching input and videoId in the local cache 83 | * @param {String} videoId The id of the video 84 | * @param {String} input The string to analyze 85 | * @param {Array} entities The entities to save 
const module = {
  cache: _.bindAll(cache, _.functions(cache)),

  /**
   * Return a singleton instance of the Google Language client
   * @returns {Object} Instance of LanguageServiceClient
   **/
  client() {
    // Lazily instantiated on first use, then shared by every later call
    if (!CLIENT) {
      CLIENT = new language.LanguageServiceClient();
    }
    return CLIENT;
  },

  /**
   * Enrich the video with the list of speakers
   * @param {Object} video The original video
   * @returns {Object} A shallow copy of the video, with a .speakers key added
   */
  async enrichVideo(video) {
    const speakers = await this.getSpeakers(video);

    // Return a copy rather than writing into the object owned by the caller
    return { ...video, speakers };
  },

  /**
   * React to errors returned by the Google API
   * @param {Object} err Error thrown
   * @param {String} videoId Id of the video
   * @returns {Void}
   **/
  handleGoogleApiErrors(err, videoId) {
    const message = err.message;
    // We catch "missing credentials" error. This is a hard error, the whole
    // process needs to stop
    if (_.includes(message, 'Could not load the default credentials')) {
      console.info('Unable to load Google Natural Language API credentials');
      console.info(
        'Make sure you have GOOGLE_APPLICATION_CREDENTIALS set to the path to your google.service-account-file.json file'
      );
      console.info(
        "If you don't have a service-account.json file, create one on https://console.cloud.google.com/apis/credentials/serviceaccountkey"
      );
      this.stopProcess();
      // Only reachable when stopProcess() is stubbed; a fatal error must not
      // also be reported as a simple warning
      return;
    }

    // Other errors will just warn
    pulse.emit('warning', message, `https://youtu.be/${videoId}`);
  },

  /**
   * Stop the current node process
   * @returns {Void}
   **/
  stopProcess() {
    // eslint-disable-next-line no-process-exit
    process.exit(1);
  },

  /**
   * Return the list of entities as found by the Google Language API for the given
   * input. Results will be read from cache if cache is enabled
   * @param {String} videoId The videoId (used as a cache key)
   * @param {String} input The sentence to analyze
   * @return {Array} An array of entities. Empty if the API call failed or
   * returned no entities
   **/
  async getEntities(videoId, input) {
    const cacheHit = cache.read(videoId, input);
    if (cacheHit) {
      return cacheHit;
    }

    const options = {
      content: input,
      type: 'PLAIN_TEXT',
    };

    let results;
    try {
      results = await this.client().analyzeEntities({ document: options });
    } catch (err) {
      this.handleGoogleApiErrors(err, videoId);
      return [];
    }

    // Guard against an empty or malformed response instead of throwing
    const entities = _.get(results, [0, 'entities'], []);

    // Save the API result to disk for debug purposes
    const logPath = `language/${globals.configName()}/${uuid()}.json`;
    const logResults = { input, results };
    diskLogger.write(logPath, logResults);

    // Save to cache as well
    cache.write(videoId, input, entities);

    return entities;
  },

  /**
   * Return all the speakers extracted from the video
   * @param {Object} video The video object
   * @return {Array} Array of object containing speaker data
   **/
  async getSpeakers(video) {
    const videoTitle = _.get(video, 'video.title');
    const videoId = _.get(video, 'video.id');
    const entities = await this.getEntities(videoId, videoTitle);

    // Keep only persons that are mentioned at least once by a proper name
    let matchingEntities = _.filter(entities, { type: 'PERSON' });
    matchingEntities = _.filter(matchingEntities, entity =>
      _.find(entity.mentions, { type: 'PROPER' })
    );

    return _.map(matchingEntities, speaker => ({ name: speaker.name }));
  },

  /**
   * Enrich all videos in the list and return the enriched list
   * @param {Array} videos List of videos
   * @returns {Array} Enriched list of videos
   **/
  async enrichVideos(videos) {
    pulse.emit('enrich:start', { videoCount: videos.length });
    const shouldUseCache = globals.readFromCache();
    if (shouldUseCache) {
      await this.cache.grab();
    }

    let newVideos;
    try {
      newVideos = await pMap(videos, async video => {
        const newVideo = await this.enrichVideo(video);
        pulse.emit('enrich:chunk');
        return newVideo;
      });
    } finally {
      // Commit what was cached so far even if one enrichment failed, so the
      // API results already obtained are not lost
      if (shouldUseCache) {
        await this.cache.release();
      }
    }

    pulse.emit('enrich:end');
    return newVideos;
  },
};
helper.mock(module); 4 | 5 | jest.mock('../language'); 6 | import language from '../language'; 7 | 8 | describe('transform', () => { 9 | describe('getCaptionUrl', () => { 10 | it('should get a url that goes direction to the start', () => { 11 | const actual = module.getCaptionUrl('foo', 42); 12 | 13 | expect(actual).toEqual('https://www.youtube.com/watch?v=foo&t=42s'); 14 | }); 15 | 16 | it('should have a default url if no caption', () => { 17 | const actual = module.getCaptionUrl('foo'); 18 | 19 | expect(actual).toEqual('https://www.youtube.com/watch?v=foo'); 20 | }); 21 | 22 | it('should use the default url if starts at 0s', () => { 23 | const actual = module.getCaptionUrl('foo', 0); 24 | 25 | expect(actual).toEqual('https://www.youtube.com/watch?v=foo'); 26 | }); 27 | }); 28 | 29 | describe('getCaptionDetails', () => { 30 | it('should set the starting second', () => { 31 | const input = { start: 5.7 }; 32 | 33 | const actual = module.getCaptionDetails(input); 34 | 35 | expect(actual).toHaveProperty('start', 5); 36 | }); 37 | 38 | it('should set the content', () => { 39 | const input = { content: 'foo' }; 40 | 41 | const actual = module.getCaptionDetails(input); 42 | 43 | expect(actual).toHaveProperty('content', 'foo'); 44 | }); 45 | 46 | it('should set the position', () => { 47 | const input = {}; 48 | 49 | const actual = module.getCaptionDetails(input, 42); 50 | 51 | expect(actual).toHaveProperty('position', 42); 52 | }); 53 | 54 | it('should set the url', () => { 55 | const caption = { start: 5.86 }; 56 | const videoId = 'foo'; 57 | 58 | const actual = module.getCaptionDetails(caption, 42, videoId); 59 | 60 | expect(actual).toHaveProperty( 61 | 'url', 62 | 'https://www.youtube.com/watch?v=foo&t=5s' 63 | ); 64 | }); 65 | 66 | it('should return a default caption if no input caption', () => { 67 | const actual = module.getCaptionDetails(undefined, 42, 'foo'); 68 | 69 | expect(actual).toHaveProperty('content', null); 70 | expect(actual).toHaveProperty('duration', 
0); 71 | expect(actual).toHaveProperty('start', 0); 72 | expect(actual).toHaveProperty('position', 42); 73 | expect(actual).toHaveProperty( 74 | 'url', 75 | 'https://www.youtube.com/watch?v=foo' 76 | ); 77 | }); 78 | }); 79 | 80 | describe('recordsFromVideo', () => { 81 | it('get one record per caption', () => { 82 | const input = { 83 | captions: [{ content: 'foo' }, { content: 'bar' }], 84 | }; 85 | 86 | const actual = module.recordsFromVideo(input); 87 | 88 | expect(actual).toHaveLength(2); 89 | }); 90 | 91 | it('sets the caption details', () => { 92 | mock('getCaptionDetails', { content: 'bar' }); 93 | const input = { 94 | captions: [{ content: 'foo' }], 95 | }; 96 | 97 | const actual = module.recordsFromVideo(input); 98 | 99 | expect(actual[0]).toHaveProperty('caption.content', 'bar'); 100 | }); 101 | 102 | it('contains the video, playlist and channel info', () => { 103 | const input = { 104 | video: { id: 'foo' }, 105 | playlist: { id: 'bar' }, 106 | channel: { id: 'baz' }, 107 | }; 108 | 109 | const actual = module.recordsFromVideo(input); 110 | 111 | expect(actual[0]).toHaveProperty('video.id', 'foo'); 112 | expect(actual[0]).toHaveProperty('playlist.id', 'bar'); 113 | expect(actual[0]).toHaveProperty('channel.id', 'baz'); 114 | }); 115 | 116 | it('still create a record if no captions', () => { 117 | const input = { video: { id: 'foo' } }; 118 | 119 | const actual = module.recordsFromVideo(input); 120 | 121 | expect(actual).toHaveLength(1); 122 | expect(actual[0]).toHaveProperty('video.id', 'foo'); 123 | }); 124 | 125 | it('set the popularity score to each record', () => { 126 | mock('getPopularityScore', 1234); 127 | const input = { video: { id: 'foo' } }; 128 | 129 | const actual = module.recordsFromVideo(input); 130 | 131 | expect(actual[0]).toHaveProperty('video.popularity.score', 1234); 132 | }); 133 | 134 | it('set the bucketed published date score to each record', () => { 135 | mock('getBucketedDate', { day: 1, year: 4 }); 136 | const input = { 
video: { id: 'foo' } }; 137 | 138 | const actual = module.recordsFromVideo(input); 139 | 140 | expect(actual[0]).toHaveProperty('video.publishedDate.day', 1); 141 | expect(actual[0]).toHaveProperty('video.publishedDate.year', 4); 142 | }); 143 | }); 144 | 145 | describe('getPopularityScore', () => { 146 | it('sums up all interactions', () => { 147 | const input = { 148 | popularity: { 149 | comments: 1, 150 | dislikes: 2, 151 | favorites: 3, 152 | likes: 4, 153 | views: 5, 154 | }, 155 | }; 156 | 157 | const actual = module.getPopularityScore(input); 158 | 159 | expect(actual).toEqual(15); 160 | }); 161 | }); 162 | 163 | describe('getBucketedDate', () => { 164 | it('sums up all interactions', () => { 165 | const input = 1521217180; 166 | 167 | const actual = module.getBucketedDate(input); 168 | 169 | expect(actual).toHaveProperty('year', 1514761200); 170 | expect(actual).toHaveProperty('month', 1519858800); 171 | expect(actual).toHaveProperty('day', 1521154800); 172 | expect(actual).toHaveProperty('timestamp', input); 173 | }); 174 | }); 175 | 176 | describe('guessConferenceYear', () => { 177 | it('should guess the year from the playlist title', () => { 178 | const video = { 179 | playlist: { 180 | title: 'Awesome Conference 2018', 181 | }, 182 | }; 183 | 184 | const actual = module.guessConferenceYear(video); 185 | 186 | expect(actual).toEqual(2018); 187 | }); 188 | 189 | it('should return null if no playlist title', () => { 190 | const video = {}; 191 | 192 | const actual = module.guessConferenceYear(video); 193 | 194 | expect(actual).toEqual(null); 195 | }); 196 | 197 | it('should return null if playlist title not parseable', () => { 198 | const video = { 199 | playlist: { 200 | title: 'Not an interesting title', 201 | }, 202 | }; 203 | 204 | const actual = module.guessConferenceYear(video); 205 | 206 | expect(actual).toEqual(null); 207 | }); 208 | 209 | describe('real examples', () => { 210 | describe('saastr', () => { 211 | it('2018', () => { 212 | const input 
= { 213 | playlist: { 214 | title: 'SaaStr Annual 2018: (Some Of) Best Of', 215 | }, 216 | }; 217 | 218 | const actual = module.guessConferenceYear(input); 219 | 220 | expect(actual).toEqual(2018); 221 | }); 222 | 223 | it('2017', () => { 224 | const input = { 225 | playlist: { 226 | title: 'SaaStr Annual 2017 Sessions', 227 | }, 228 | }; 229 | 230 | const actual = module.guessConferenceYear(input); 231 | 232 | expect(actual).toEqual(2017); 233 | }); 234 | 235 | it('2016', () => { 236 | const input = { 237 | playlist: { 238 | title: 'SaaStr Annual 2016', 239 | }, 240 | }; 241 | 242 | const actual = module.guessConferenceYear(input); 243 | 244 | expect(actual).toEqual(2016); 245 | }); 246 | }); 247 | }); 248 | }); 249 | 250 | describe('enrichVideos', () => { 251 | beforeEach(() => { 252 | // Enriching through language won't do anything by default 253 | language.enrichVideos.mockImplementation(args => args); 254 | }); 255 | 256 | it('enrich videos through language', async () => { 257 | const input = [{ foo: 'bar' }]; 258 | language.enrichVideos.mockImplementation(videos => [ 259 | { ...videos[0], language: true }, 260 | ]); 261 | 262 | const actual = await module.enrichVideos(input); 263 | 264 | expect(actual[0]).toHaveProperty('language', true); 265 | }); 266 | 267 | it('add conference year', async () => { 268 | const input = [{ foo: 'bar' }]; 269 | mock('guessConferenceYear', 2018); 270 | 271 | const actual = await module.enrichVideos(input); 272 | 273 | expect(actual[0]).toHaveProperty('conference.year', 2018); 274 | }); 275 | }); 276 | }); 277 | -------------------------------------------------------------------------------- /src/__tests__/language.js: -------------------------------------------------------------------------------- 1 | import module from '../language'; 2 | import helper from '../test-helper'; 3 | const mock = helper.mock(module); 4 | const mockCache = helper.mock(module.cache); 5 | 6 | jest.mock('../disk-logger'); 7 | 8 | jest.mock('../globals'); 
9 | import globals from '../globals'; 10 | 11 | jest.mock('../fileutils'); 12 | import fileutils from '../fileutils'; 13 | 14 | jest.mock('../pulse'); 15 | import pulse from '../pulse'; 16 | pulse.emit = jest.fn(); 17 | 18 | describe('language', () => { 19 | describe('cache', () => { 20 | beforeEach(() => { 21 | module.cache.set({}); 22 | }); 23 | 24 | describe('grab', () => { 25 | it('should set the CACHE to the value of the file on disk', async () => { 26 | fileutils.readJson.mockReturnValue({ foo: 'bar' }); 27 | 28 | await module.cache.grab(); 29 | const actual = module.cache.get(); 30 | 31 | expect(actual).toHaveProperty('foo', 'bar'); 32 | }); 33 | 34 | it('should set the CACHE to {} if no file on disk', async () => { 35 | fileutils.readJson.mockReturnValue(null); 36 | 37 | await module.cache.grab(); 38 | const actual = module.cache.get(); 39 | 40 | expect(actual).toEqual({}); 41 | }); 42 | }); 43 | 44 | describe('release', () => { 45 | it('should write the content of CACHE to disk', async () => { 46 | module.cache.set({ foo: 'bar' }); 47 | mockCache('filepath', './path/to/cache.json'); 48 | 49 | await module.cache.release(); 50 | 51 | expect(fileutils.writeJson).toHaveBeenCalledWith( 52 | './path/to/cache.json', 53 | { 54 | foo: 'bar', 55 | } 56 | ); 57 | }); 58 | }); 59 | 60 | describe('read', () => { 61 | describe('cache enabled', () => { 62 | beforeEach(() => { 63 | globals.readFromCache.mockReturnValue(true); 64 | }); 65 | 66 | it('should return the cache value if available', () => { 67 | module.cache.set({ foo: [{ input: 'bar', entities: 'baz' }] }); 68 | const actual = module.cache.read('foo', 'bar'); 69 | 70 | expect(actual).toEqual('baz'); 71 | }); 72 | 73 | it('should return false if no such video if', () => { 74 | module.cache.set({ foo: [{ input: 'bar', entities: 'baz' }] }); 75 | const actual = module.cache.read('fee', 'bar'); 76 | 77 | expect(actual).toEqual(false); 78 | }); 79 | 80 | it('should return false if no such input', () => { 81 | 
module.cache.set({ foo: [{ input: 'bar', entities: 'baz' }] }); 82 | const actual = module.cache.read('foo', 'fee'); 83 | 84 | expect(actual).toEqual(false); 85 | }); 86 | }); 87 | 88 | describe('cache disabled', () => { 89 | beforeEach(() => { 90 | globals.readFromCache.mockReturnValue(false); 91 | }); 92 | it('should return false', () => { 93 | const actual = module.cache.read('foo', 'bar'); 94 | 95 | expect(actual).toEqual(false); 96 | }); 97 | it('should return false even if their is a match in the cache', () => { 98 | module.cache.set({ foo: [{ input: 'bar', entities: 'baz' }] }); 99 | const actual = module.cache.read('foo', 'bar'); 100 | 101 | expect(actual).toEqual(false); 102 | }); 103 | }); 104 | }); 105 | 106 | describe('write', () => { 107 | describe('cache enabled', () => { 108 | beforeEach(() => { 109 | globals.readFromCache.mockReturnValue(true); 110 | }); 111 | 112 | it('should add a new input to existing video', () => { 113 | module.cache.set({ foo: [] }); 114 | 115 | module.cache.write('foo', 'bar', 'baz'); 116 | 117 | const actual = module.cache.get(); 118 | 119 | expect(actual).toHaveProperty('foo', [ 120 | { input: 'bar', entities: 'baz' }, 121 | ]); 122 | }); 123 | 124 | it('should add a new video and input', () => { 125 | module.cache.set({}); 126 | 127 | module.cache.write('foo', 'bar', 'baz'); 128 | 129 | const actual = module.cache.get(); 130 | 131 | expect(actual).toHaveProperty('foo', [ 132 | { input: 'bar', entities: 'baz' }, 133 | ]); 134 | }); 135 | }); 136 | 137 | describe('cache disabled', () => { 138 | beforeEach(() => { 139 | globals.readFromCache.mockReturnValue(false); 140 | }); 141 | 142 | it('should not update the cache', () => { 143 | module.cache.set({}); 144 | 145 | module.cache.write('foo', 'bar', 'baz'); 146 | 147 | const actual = module.cache.get(); 148 | 149 | expect(actual).toEqual({}); 150 | }); 151 | }); 152 | }); 153 | }); 154 | 155 | describe('enrichVideo', () => { 156 | it('should set the speakers to the list of 
speakers', async () => { 157 | mock('getSpeakers', 'my_speakers'); 158 | const input = {}; 159 | 160 | const actual = await module.enrichVideo(input); 161 | 162 | expect(actual).toHaveProperty('speakers', 'my_speakers'); 163 | }); 164 | }); 165 | 166 | describe('getEntities', () => { 167 | it('should return cache value if exists', async () => { 168 | mockCache('read', 'foo'); 169 | 170 | const actual = await module.getEntities('anything', 'anything'); 171 | 172 | expect(actual).toEqual('foo'); 173 | }); 174 | 175 | it('should return entities as returned by the API', async () => { 176 | mockCache('read'); 177 | const mockAnalyzeEntities = jest 178 | .fn() 179 | .mockReturnValue([{ entities: 'foo' }]); 180 | mock('client', { 181 | analyzeEntities: mockAnalyzeEntities, 182 | }); 183 | 184 | const actual = await module.getEntities('videoId', 'my sentence'); 185 | 186 | expect(mockAnalyzeEntities).toHaveBeenCalledWith({ 187 | document: { 188 | content: 'my sentence', 189 | type: 'PLAIN_TEXT', 190 | }, 191 | }); 192 | expect(actual).toEqual('foo'); 193 | }); 194 | 195 | it('should save value to cache', async () => { 196 | mockCache('write'); 197 | const mockAnalyzeEntities = jest 198 | .fn() 199 | .mockReturnValue([{ entities: 'foo' }]); 200 | mock('client', { 201 | analyzeEntities: mockAnalyzeEntities, 202 | }); 203 | 204 | await module.getEntities('videoId', 'my sentence'); 205 | 206 | expect(module.cache.write).toHaveBeenCalledWith( 207 | 'videoId', 208 | 'my sentence', 209 | 'foo' 210 | ); 211 | }); 212 | 213 | it('should emit a warning if the client throws an error ', async () => { 214 | mock('client', { 215 | analyzeEntities() { 216 | const err = new Error(); 217 | err.details = 'details'; 218 | throw err; 219 | }, 220 | }); 221 | const mockHandleErrors = mock('handleGoogleApiErrors'); 222 | 223 | const actual = await module.getEntities('my_video', 'anything'); 224 | 225 | expect(actual).toEqual([]); 226 | expect(mockHandleErrors).toHaveBeenCalled(); 227 | }); 228 
| 229 | describe('getSpeakers', () => { 230 | it('should get entities for the specified title and id', async () => { 231 | const input = { 232 | video: { 233 | title: 'title', 234 | id: 'videoId', 235 | }, 236 | }; 237 | const mockGetEntities = mock('getEntities'); 238 | 239 | await module.getSpeakers(input); 240 | 241 | expect(mockGetEntities).toHaveBeenCalledWith('videoId', 'title'); 242 | }); 243 | 244 | it('should return all speakers', async () => { 245 | mock('getEntities', [ 246 | { 247 | type: 'PERSON', 248 | mentions: [{ type: 'PROPER' }], 249 | name: 'Tim Carry', 250 | }, 251 | { 252 | type: 'PERSON', 253 | mentions: [{ type: 'COMMON' }], 254 | name: 'CEO', 255 | }, 256 | { 257 | type: 'PERSON', 258 | mentions: [{ type: 'PROPER' }], 259 | name: 'John Doe', 260 | }, 261 | { 262 | type: 'LOCATION', 263 | name: 'Tel Aviv', 264 | }, 265 | ]); 266 | 267 | const actual = await module.getSpeakers(); 268 | 269 | expect(actual[0]).toHaveProperty('name', 'Tim Carry'); 270 | expect(actual[1]).toHaveProperty('name', 'John Doe'); 271 | }); 272 | }); 273 | }); 274 | 275 | describe('handleGoogleApiErrors', () => { 276 | it('should emit a warning for regular errors', () => { 277 | const error = { message: 'Foo bar' }; 278 | const videoId = 'foo'; 279 | 280 | module.handleGoogleApiErrors(error, videoId); 281 | 282 | expect(pulse.emit).toHaveBeenCalledWith( 283 | 'warning', 284 | 'Foo bar', 285 | 'https://youtu.be/foo' 286 | ); 287 | }); 288 | it('should stop the process if missing credentials', () => { 289 | const error = { 290 | message: 'blablah Could not load the default credentials blablah', 291 | }; 292 | const videoId = 'foo'; 293 | const mockStopProcess = mock('stopProcess'); 294 | const mockConsole = jest 295 | .spyOn(global.console, 'info') 296 | .mockImplementation(); 297 | 298 | module.handleGoogleApiErrors(error, videoId); 299 | 300 | expect(mockStopProcess).toHaveBeenCalled(); 301 | expect(mockConsole).toHaveBeenCalled(); 302 | }); 303 | }); 304 | 305 | 
describe('enrichVideos', () => { 306 | describe('with cache enabled', () => { 307 | beforeEach(() => { 308 | globals.readFromCache.mockReturnValue(true); 309 | }); 310 | 311 | it('should call grabCache and releaseCache', async () => { 312 | const mockGrab = mockCache('grab'); 313 | const mockRelease = mockCache('release'); 314 | 315 | const input = []; 316 | await module.enrichVideos(input); 317 | 318 | expect(mockGrab).toHaveBeenCalled(); 319 | expect(mockRelease).toHaveBeenCalled(); 320 | }); 321 | }); 322 | 323 | describe('with cache disabled', () => { 324 | beforeEach(() => { 325 | globals.readFromCache.mockReturnValue(false); 326 | }); 327 | 328 | it('should not call grabCache and releaseCache', async () => { 329 | const mockGrab = mockCache('grab'); 330 | const mockRelease = mockCache('release'); 331 | 332 | const input = []; 333 | await module.enrichVideos(input); 334 | 335 | expect(mockGrab).not.toHaveBeenCalled(); 336 | expect(mockRelease).not.toHaveBeenCalled(); 337 | }); 338 | }); 339 | 340 | it('should call enrichVideo on each video', async () => { 341 | const input = [{ name: 'foo' }, { name: 'bar' }]; 342 | const mockEnrichVideo = mock('enrichVideo'); 343 | mockEnrichVideo.mockImplementation(video => ({ ...video, done: true })); 344 | 345 | const actual = await module.enrichVideos(input); 346 | 347 | expect(mockEnrichVideo).toHaveBeenCalledWith({ name: 'foo' }); 348 | expect(mockEnrichVideo).toHaveBeenCalledWith({ name: 'bar' }); 349 | expect(actual[0]).toHaveProperty('done', true); 350 | expect(actual[1]).toHaveProperty('done', true); 351 | }); 352 | }); 353 | }); 354 | -------------------------------------------------------------------------------- /src/youtube.js: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import cheerio from 'cheerio'; 3 | import dayjs from 'dayjs'; 4 | import diskLogger from './disk-logger'; 5 | import fileutils from './fileutils'; 6 | import globals from 
'./globals'; 7 | import pMap from 'p-map'; 8 | import parseIsoDuration from 'parse-iso-duration'; 9 | import pulse from './pulse'; 10 | import qs from 'query-string'; 11 | import _ from 'lodash'; 12 | import { forEach, map } from 'p-iteration'; 13 | 14 | const module = { 15 | /** 16 | * Call a Youtube API endpoint with GET parameters 17 | * 18 | * @param {String} endpoint The /endpoint to call 19 | * @param {Object} params The parameters to pass 20 | * @returns {Promise.} The data returned by the call 21 | **/ 22 | async get(endpoint, params) { 23 | try { 24 | const options = { 25 | baseURL: 'https://www.googleapis.com/youtube/v3', 26 | url: endpoint, 27 | params: { 28 | key: globals.youtubeApiKey(), 29 | ...params, 30 | }, 31 | }; 32 | const results = await axios(options); 33 | return results.data; 34 | } catch (err) { 35 | pulse.emit('error', err, `get/${endpoint}/${JSON.stringify(params)}`); 36 | return {}; 37 | } 38 | }, 39 | /** 40 | * Return details about a specific playlist 41 | * 42 | * @param {String} playlistId The playlist id 43 | * @returns {Promise.} The playlist data 44 | **/ 45 | async getPlaylistData(playlistId) { 46 | try { 47 | const response = await this.get('playlists', { 48 | id: playlistId, 49 | part: 'snippet', 50 | }); 51 | diskLogger.write(`playlist/${playlistId}.json`, response); 52 | 53 | const playlistData = response.items[0]; 54 | return { 55 | id: playlistId, 56 | title: playlistData.snippet.title, 57 | description: playlistData.snippet.description, 58 | }; 59 | } catch (err) { 60 | pulse.emit('error', err, `getPlaylistData(${playlistId})`); 61 | return {}; 62 | } 63 | }, 64 | /** 65 | * Returns a list of all videos from a specific playlist 66 | * 67 | * @param {String} playlistId The id of the playlist 68 | * @returns {Promise.} A list of all videos in a playlist 69 | * 70 | * It can only get up to 50 videos per page in one call. It will browse all 71 | * pages to get all videos. 
72 | **/ 73 | async getVideosFromPlaylist(playlistId) { 74 | try { 75 | const resultsPerPage = 50; 76 | const playlistData = await this.getPlaylistData(playlistId); 77 | let pageToken = null; 78 | let videos = []; 79 | let page = 1; 80 | do { 81 | // Get list of all videos in the playlist 82 | const pageItems = await this.get('playlistItems', { 83 | playlistId, 84 | maxResults: resultsPerPage, 85 | pageToken, 86 | part: 'snippet,contentDetails', 87 | }); 88 | 89 | diskLogger.write( 90 | `playlistItems/${playlistId}-page-${page}.json`, 91 | pageItems 92 | ); 93 | if (page === 1) { 94 | pulse.emit('playlist:start', { 95 | playlistId, 96 | totalVideoCount: _.get(pageItems, 'pageInfo.totalResults'), 97 | }); 98 | } 99 | 100 | const pageVideos = await this.getVideosFromPlaylistPage(pageItems); 101 | pulse.emit('playlist:chunk', { 102 | playlistId, 103 | chunkVideoCount: pageVideos.length, 104 | }); 105 | videos = _.concat(videos, pageVideos); 106 | 107 | pageToken = pageItems.nextPageToken; 108 | page++; 109 | } while (pageToken); 110 | 111 | // Adding playlist information to all videos 112 | videos = _.map(videos, video => ({ 113 | ...video, 114 | playlist: playlistData, 115 | })); 116 | 117 | pulse.emit('playlist:end', { videos }); 118 | 119 | return videos; 120 | } catch (err) { 121 | pulse.emit('error', err, `getVideosFromPlaylist(${playlistId})`); 122 | return []; 123 | } 124 | }, 125 | /** 126 | * Given a playlist page, returns the list of videos of this page, along with 127 | * their details 128 | * @param {Object} pageResults The playlist page, as returned by the 129 | * /playlistItems endpoint 130 | * @returns {Promise.} An array of all videos, along with details 131 | * 132 | * Note that some videos returned by the playlistItems might be private. In that 133 | * case, we won't get any additional data for those videos, so we'll remove them 134 | * from the returned videos. 
135 | **/ 136 | async getVideosFromPlaylistPage(pageResults) { 137 | // Page results will give us the videoId and matching position in playlist 138 | const allVideoInfoFromPage = {}; 139 | const blockList = _.get(globals.config(), 'blockList', []); 140 | _.each(pageResults.items, video => { 141 | const videoId = _.get(video, 'contentDetails.videoId'); 142 | // Skipping videos that should be excluded 143 | if (_.includes(blockList, videoId)) { 144 | return; 145 | } 146 | 147 | const positionInPlaylist = _.get(video, 'snippet.position'); 148 | 149 | // Some videos are sometimes set several times in the same playlist page, 150 | // resulting in final count being wrong 151 | if (allVideoInfoFromPage[videoId]) { 152 | const initialPosition = 153 | allVideoInfoFromPage[videoId].video.positionInPlaylist; 154 | const newPosition = positionInPlaylist; 155 | pulse.emit( 156 | 'warning', 157 | 'Some videos are added several times to the same playlist', 158 | `https://youtu.be/${videoId} at position ${initialPosition} and ${newPosition}` 159 | ); 160 | } 161 | 162 | allVideoInfoFromPage[videoId] = { 163 | video: { 164 | id: videoId, 165 | positionInPlaylist, 166 | }, 167 | }; 168 | }); 169 | 170 | // We also need more detailed information about each video 171 | const videoPageIds = _.keys(allVideoInfoFromPage); 172 | const videoDetails = await this.getVideosData(videoPageIds); 173 | 174 | // If we don't have all the details for all video, we issue a warning 175 | const videoDetailsIds = _.keys(videoDetails); 176 | if (videoDetailsIds.length !== videoPageIds.length) { 177 | const excludedIds = _.difference(videoPageIds, videoDetailsIds); 178 | pulse.emit( 179 | 'warning', 180 | 'Unable to get details for the following videos', 181 | _.map(excludedIds, id => `https://youtu.be/${id}`) 182 | ); 183 | } 184 | 185 | // Discarding videos where we don't have any data and merging together 186 | const selectedVideoInfoFromPage = _.pick( 187 | allVideoInfoFromPage, 188 | videoDetailsIds 
189 | ); 190 | const newVideos = _.values( 191 | _.merge(videoDetails, selectedVideoInfoFromPage) 192 | ); 193 | 194 | return newVideos; 195 | }, 196 | /** 197 | * Returns details about specific videos 198 | * 199 | * @param {Array.} userVideoId The array of ids of the 200 | * video to get data from 201 | * @returns {Promise.} An object where each key is a video id and each 202 | * value its detailed information 203 | **/ 204 | async getVideosData(userVideoId) { 205 | try { 206 | const parts = ['contentDetails', 'snippet', 'statistics', 'status'].join( 207 | ',' 208 | ); 209 | let videoIds = userVideoId; 210 | if (!_.isArray(videoIds)) { 211 | videoIds = [videoIds]; 212 | } 213 | 214 | const response = await this.get('videos', { 215 | id: videoIds.join(','), 216 | part: parts, 217 | }); 218 | diskLogger.write( 219 | `videos/${_.first(videoIds)}-to-${_.last(videoIds)}.json`, 220 | response 221 | ); 222 | 223 | const items = _.get(response, 'items', []); 224 | const videoData = {}; 225 | await pMap(items, async data => { 226 | const videoId = data.id; 227 | const defaultAudioLanguage = _.get( 228 | data, 229 | 'snippet.defaultAudioLanguage' 230 | ); 231 | const captions = await this.getCaptions(videoId, defaultAudioLanguage); 232 | 233 | const channelMetadata = this.formatChannel(data); 234 | const videoMetadata = this.formatVideo(data, captions); 235 | 236 | videoData[videoId] = { 237 | channel: channelMetadata, 238 | video: videoMetadata, 239 | captions, 240 | }; 241 | }); 242 | 243 | return videoData; 244 | } catch (err) { 245 | pulse.emit('error', err, `getVideosData(${userVideoId})`); 246 | return {}; 247 | } 248 | }, 249 | 250 | async getVideosFromCache() { 251 | const config = globals.config(); 252 | const configName = globals.configName(); 253 | const playlists = config.playlists; 254 | const blockList = config.blockList; 255 | 256 | const playlistGlob = 257 | playlists.length === 1 258 | ? 
`${playlists[0]}.json` 259 | : `{${playlists.join(',')}}.json`; 260 | 261 | const playlistFiles = await fileutils.glob( 262 | `./cache/${configName}/youtube/${playlistGlob}` 263 | ); 264 | let videos = _.flatten(await map(playlistFiles, fileutils.readJson)); 265 | 266 | // Remove videos that are part of the blocklist 267 | if (blockList) { 268 | videos = _.reject(videos, video => 269 | _.includes(blockList, _.get(video, 'video.id')) 270 | ); 271 | } 272 | 273 | return videos; 274 | }, 275 | 276 | async getVideosFromApi() { 277 | const config = globals.config(); 278 | const configName = globals.configName(); 279 | const playlists = config.playlists; 280 | 281 | pulse.emit('youtube:crawling:start', { playlists }); 282 | 283 | const allVideos = []; 284 | await forEach(playlists, async playlistId => { 285 | const videos = await this.getVideosFromPlaylist(playlistId); 286 | 287 | await fileutils.writeJson( 288 | `./cache/${configName}/youtube/${playlistId}.json`, 289 | videos 290 | ); 291 | 292 | allVideos.push(videos); 293 | }); 294 | 295 | pulse.emit('youtube:crawling:end'); 296 | return _.flatten(allVideos); 297 | }, 298 | 299 | /** 300 | * Extract hasCaptions and hasManualCaptions from the data received from the 301 | * API. 
302 | * @param {Object} data Video data object as received by the API 303 | * @param {Array} captions The array of captions 304 | * @return {Object} Object containing boolean keys .hasCaptions and 305 | * .hasManualCaptions 306 | **/ 307 | formatCaptions(data, captions) { 308 | const hasCaptions = captions.length > 0; 309 | const hasManualCaptions = _.get(data, 'contentDetails.caption') === 'true'; 310 | return { hasCaptions, hasManualCaptions }; 311 | }, 312 | 313 | /** 314 | * Format the statistics as returned by the API into an object 315 | * @param {Object} data Video data object as received by the API 316 | * @return {Object} Object containing .views, .likes, .dislikes, .favorites, 317 | * .comments counts as numbers 318 | **/ 319 | formatPopularity(data) { 320 | const viewCount = _.parseInt(_.get(data, 'statistics.viewCount')); 321 | const likeCount = _.parseInt(_.get(data, 'statistics.likeCount')); 322 | const dislikeCount = _.parseInt(_.get(data, 'statistics.dislikeCount')); 323 | const favoriteCount = _.parseInt(_.get(data, 'statistics.favoriteCount')); 324 | const commentCount = _.parseInt(_.get(data, 'statistics.commentCount')); 325 | return { 326 | views: viewCount, 327 | likes: likeCount, 328 | dislikes: dislikeCount, 329 | favorites: favoriteCount, 330 | comments: commentCount, 331 | }; 332 | }, 333 | 334 | /** 335 | * Format the duration as returned by the API into an object 336 | * @param {Object} data Video data object as received by the API 337 | * @return {Object} Object containing a .minutes and .seconds keys 338 | **/ 339 | formatDuration(data) { 340 | const durationInSeconds = 341 | parseIsoDuration(_.get(data, 'contentDetails.duration')) / 1000; 342 | return { 343 | minutes: Math.floor(durationInSeconds / 60), 344 | seconds: durationInSeconds % 60, 345 | }; 346 | }, 347 | 348 | formatChannel(data) { 349 | return { 350 | id: _.get(data, 'snippet.channelId'), 351 | title: _.get(data, 'snippet.channelTitle'), 352 | }; 353 | }, 354 | 355 | 
formatVideo(data, captions) { 356 | const videoId = data.id; 357 | const captionsMetadata = this.formatCaptions(data, captions); 358 | const popularity = this.formatPopularity(data); 359 | const duration = this.formatDuration(data); 360 | const publishedDate = dayjs(_.get(data, 'snippet.publishedAt')).unix(); 361 | const url = `https://www.youtube.com/watch?v=${videoId}`; 362 | 363 | return { 364 | id: videoId, 365 | title: _.get(data, 'snippet.title'), 366 | description: _.get(data, 'snippet.description'), 367 | thumbnails: _.get(data, 'snippet.thumbnails'), 368 | languageCode: _.get(data, 'snippet.defaultAudioLanguage'), 369 | publishedDate, 370 | popularity, 371 | duration, 372 | url, 373 | ...captionsMetadata, 374 | }; 375 | }, 376 | 377 | /** 378 | * Get raw information about a YouTube video. 379 | * 380 | * @param {String} videoId Id of the video 381 | * @returns {Object} Raw data about the video 382 | * 383 | * Note: This call does not use the API,but a rather obscure, undocumented, 384 | * endpoint. The data returned itself is in a variety of formats that has to be 385 | * parsed to make a cohesive object. 
386 | * TOTEST 387 | **/ 388 | async getRawVideoInfo(videoId) { 389 | /* eslint-disable camelcase */ 390 | try { 391 | const options = { 392 | url: 'http://www.youtube.com/get_video_info', 393 | params: { 394 | video_id: videoId, 395 | }, 396 | }; 397 | 398 | const results = await axios(options); 399 | diskLogger.write(`get_video_info/${videoId}.txt`, results.data); 400 | 401 | const params = qs.parse(results.data); 402 | params.adaptive_fmts = qs.parse(params.adaptive_fmts); 403 | params.atc = qs.parse(params.atc); 404 | params.fflags = qs.parse(params.fflags); 405 | params.player_response = JSON.parse(params.player_response); 406 | params.url_encoded_fmt_stream_map = qs.parse( 407 | params.url_encoded_fmt_stream_map 408 | ); 409 | diskLogger.write(`get_video_info/${videoId}.json`, params); 410 | return params; 411 | } catch (err) { 412 | pulse.emit('error', err, `getRawVideoInfo/${videoId}`); 413 | return {}; 414 | } 415 | /* eslint-enable camelcase */ 416 | }, 417 | 418 | /** 419 | * Get the caption url for a given videoId 420 | * 421 | * @param {String} videoId Id of the video 422 | * @param {String} languageCode Language of the caption 423 | * @returns {String} Url to get the video caption file 424 | **/ 425 | async getCaptionsUrl(videoId, languageCode) { 426 | try { 427 | const rawData = await this.getRawVideoInfo(videoId); 428 | const allCaptions = _.get( 429 | rawData, 430 | 'player_response.captions.playerCaptionsTracklistRenderer.captionTracks' 431 | ); 432 | 433 | // No captions 434 | if (_.isEmpty(allCaptions)) { 435 | return false; 436 | } 437 | 438 | const manualCaptions = _.reject( 439 | allCaptions, 440 | caption => _.get(caption, 'kind') === 'asr' 441 | ); 442 | const automaticCaptions = _.difference(allCaptions, manualCaptions); 443 | 444 | const matchingCaption = 445 | _.find(manualCaptions, { languageCode }) || 446 | _.find(automaticCaptions, { languageCode }) || 447 | _.first(manualCaptions) || 448 | _.first(automaticCaptions); 449 | 450 | 
return _.get(matchingCaption, 'baseUrl'); 451 | } catch (err) { 452 | pulse.emit('error', err, `getCaptionsUrl(${videoId})`); 453 | return false; 454 | } 455 | }, 456 | 457 | /** 458 | * Get captions for a given videoId 459 | * 460 | * @param {String} videoId Id of the video 461 | * @param {String} languageCode Language of the caption 462 | * @returns {Array} Array of captions 463 | **/ 464 | async getCaptions(videoId, languageCode) { 465 | // Get the content of an XML node, which itself can contain 466 | // HTML-encoded tags 467 | function getContent($node) { 468 | return cheerio.load($node.text()).text(); 469 | } 470 | 471 | try { 472 | const captionUrl = await this.getCaptionsUrl(videoId, languageCode); 473 | 474 | if (!captionUrl) { 475 | pulse.emit( 476 | 'warning', 477 | 'Some videos have no captions', 478 | `https://youtu.be/${videoId}` 479 | ); 480 | return []; 481 | } 482 | 483 | const xml = await axios.get(captionUrl); 484 | diskLogger.write(`captions/${videoId}.xml`, xml.data); 485 | 486 | const $ = cheerio.load(xml.data, { xmlMode: true }); 487 | const texts = $('text'); 488 | const captions = _.map(texts, (node, index) => { 489 | // We take nodes two at a time for the content 490 | const $thisNode = $(node); 491 | const thisContent = getContent($thisNode); 492 | const thisStart = _.round($thisNode.attr('start'), 2); 493 | const thisDuration = parseFloat($thisNode.attr('dur')); 494 | 495 | const $nextNode = $(texts[index + 1]); 496 | const nextContent = getContent($nextNode); 497 | const nextDuration = parseFloat($nextNode.attr('dur') || 0); 498 | 499 | const content = _.trim(`${thisContent} ${nextContent}`); 500 | const duration = _.round(thisDuration + nextDuration, 2); 501 | 502 | return { 503 | content, 504 | languageCode, 505 | start: thisStart, 506 | duration, 507 | }; 508 | }); 509 | 510 | return captions; 511 | } catch (err) { 512 | pulse.emit('error', err, `getCaptions(${videoId})`); 513 | return []; 514 | } 515 | }, 516 | 517 | /** 518 | * Get 
all videos as configured in the current config 519 | * 520 | * Note: You should always call globals.init(configName) before running this 521 | * method, so it can get all the required data 522 | * 523 | * @returns {Array} All videos of the current config 524 | **/ 525 | async getVideos() { 526 | const shouldReadFromCache = globals.readFromCache(); 527 | 528 | // Get videos either from disk cache or API 529 | const videos = shouldReadFromCache 530 | ? await this.getVideosFromCache() 531 | : await this.getVideosFromApi(); 532 | 533 | pulse.emit('youtube:videos', { videos }); 534 | 535 | return videos; 536 | }, 537 | }; 538 | 539 | export default _.bindAll(module, _.functions(module)); 540 | -------------------------------------------------------------------------------- /src/__tests__/youtube.js: -------------------------------------------------------------------------------- 1 | import module from '../youtube'; 2 | import helper from '../test-helper'; 3 | const mock = helper.mock(module); 4 | 5 | jest.mock('../disk-logger'); 6 | jest.mock('../fileutils'); 7 | 8 | jest.mock('axios'); 9 | import axios from 'axios'; 10 | 11 | jest.mock('../globals'); 12 | import globals from '../globals'; 13 | 14 | jest.mock('../fileutils'); 15 | import fileutils from '../fileutils'; 16 | 17 | jest.mock('../pulse'); 18 | import pulse from '../pulse'; 19 | pulse.emit = jest.fn(); 20 | 21 | const objectContaining = expect.objectContaining; 22 | const anyString = expect.any(String); 23 | 24 | describe('youtube', () => { 25 | describe('formatCaptions', () => { 26 | it('hasCaptions should be false if no caption is found', () => { 27 | const data = {}; 28 | const captions = []; 29 | 30 | const actual = module.formatCaptions(data, captions); 31 | 32 | expect(actual).toHaveProperty('hasCaptions', false); 33 | }); 34 | 35 | it('hasCaptions should be true if at least one caption is found', () => { 36 | const data = {}; 37 | const captions = [{}]; 38 | 39 | const actual = 
module.formatCaptions(data, captions); 40 | 41 | expect(actual).toHaveProperty('hasCaptions', true); 42 | }); 43 | 44 | it('hasManualCaptions should be true if contentDetails.caption is set to true', () => { 45 | const data = { 46 | contentDetails: { 47 | caption: 'true', 48 | }, 49 | }; 50 | const captions = []; 51 | 52 | const actual = module.formatCaptions(data, captions); 53 | 54 | expect(actual).toHaveProperty('hasManualCaptions', true); 55 | }); 56 | 57 | it('hasManualCaptions should be false if contentDetails.caption is set to false', () => { 58 | const data = { 59 | contentDetails: { 60 | caption: 'false', 61 | }, 62 | }; 63 | const captions = []; 64 | 65 | const actual = module.formatCaptions(data, captions); 66 | 67 | expect(actual).toHaveProperty('hasManualCaptions', false); 68 | }); 69 | }); 70 | 71 | describe('formatChannel', () => { 72 | it('should contain the id and title', () => { 73 | const data = { 74 | snippet: { 75 | channelId: 'foo', 76 | channelTitle: 'bar', 77 | }, 78 | }; 79 | 80 | const actual = module.formatChannel(data); 81 | 82 | expect(actual).toHaveProperty('id', 'foo'); 83 | expect(actual).toHaveProperty('title', 'bar'); 84 | }); 85 | }); 86 | 87 | describe('formatDuration', () => { 88 | it('it should convert the duration a .minutes .seconds object', () => { 89 | const data = { 90 | contentDetails: { 91 | duration: 'PT4M25S', 92 | }, 93 | }; 94 | 95 | const actual = module.formatDuration(data); 96 | 97 | expect(actual).toHaveProperty('minutes', 4); 98 | expect(actual).toHaveProperty('seconds', 25); 99 | }); 100 | }); 101 | 102 | describe('formatPopularity', () => { 103 | it('it should extract popularity counts from the data', () => { 104 | const data = { 105 | statistics: { 106 | commentCount: '5', 107 | dislikeCount: '0', 108 | favoriteCount: '0', 109 | likeCount: '169', 110 | viewCount: '3798', 111 | }, 112 | }; 113 | 114 | const actual = module.formatPopularity(data); 115 | 116 | expect(actual).toHaveProperty('comments', 5); 117 | 
expect(actual).toHaveProperty('dislikes', 0); 118 | expect(actual).toHaveProperty('favorites', 0); 119 | expect(actual).toHaveProperty('likes', 169); 120 | expect(actual).toHaveProperty('views', 3798); 121 | }); 122 | }); 123 | 124 | describe('formatVideo', () => { 125 | it('should contain the videoId', () => { 126 | mock('formatCaptions'); 127 | mock('formatPopularity'); 128 | mock('formatDuration'); 129 | const data = { 130 | id: 'foo', 131 | }; 132 | 133 | const actual = module.formatVideo(data); 134 | 135 | expect(actual).toHaveProperty('id', 'foo'); 136 | }); 137 | 138 | it('should contain base information', () => { 139 | mock('formatCaptions'); 140 | mock('formatPopularity'); 141 | mock('formatDuration'); 142 | const data = { 143 | snippet: { 144 | title: 'Video title', 145 | description: 'Video description', 146 | thumbnails: 'thumbnails', 147 | defaultAudioLanguage: 'fr', 148 | }, 149 | }; 150 | 151 | const actual = module.formatVideo(data); 152 | 153 | expect(actual).toHaveProperty('title', 'Video title'); 154 | expect(actual).toHaveProperty('description', 'Video description'); 155 | expect(actual).toHaveProperty('thumbnails', 'thumbnails'); 156 | expect(actual).toHaveProperty('languageCode', 'fr'); 157 | }); 158 | 159 | it('should contain extended information', () => { 160 | const data = {}; 161 | mock('formatCaptions', { foo: 'bar' }); 162 | mock('formatPopularity', 'popularity'); 163 | mock('formatDuration', 'duration'); 164 | 165 | const actual = module.formatVideo(data); 166 | 167 | expect(actual).toHaveProperty('popularity', 'popularity'); 168 | expect(actual).toHaveProperty('duration', 'duration'); 169 | expect(actual).toHaveProperty('foo', 'bar'); 170 | }); 171 | 172 | it('should contain published date as timestamp', () => { 173 | mock('formatCaptions'); 174 | mock('formatPopularity'); 175 | mock('formatDuration'); 176 | const data = { 177 | snippet: { 178 | publishedAt: '2018-03-16T16:04:29.000Z', 179 | }, 180 | }; 181 | 182 | const actual = 
module.formatVideo(data); 183 | 184 | expect(actual).toHaveProperty('publishedDate', 1521216269); 185 | }); 186 | 187 | it('should contain the video url', () => { 188 | mock('formatCaptions'); 189 | mock('formatPopularity'); 190 | mock('formatDuration'); 191 | const data = { 192 | id: 'foo', 193 | }; 194 | 195 | const actual = module.formatVideo(data); 196 | 197 | expect(actual).toHaveProperty( 198 | 'url', 199 | 'https://www.youtube.com/watch?v=foo' 200 | ); 201 | }); 202 | }); 203 | 204 | describe('getCaptions', () => { 205 | it('returns a list of captions', async () => { 206 | mock('getCaptionsUrl', '{caption_url}'); 207 | jest.spyOn(axios, 'get').mockReturnValue({ 208 | data: ` 209 | 210 | foo bar 211 | bar baz 212 | 213 | `, 214 | }); 215 | 216 | const actual = await module.getCaptions(42); 217 | 218 | expect(axios.get).toHaveBeenCalledWith('{caption_url}'); 219 | expect(actual).toHaveLength(2); 220 | expect(actual[0]).toHaveProperty('start', 13.28); 221 | expect(actual[0]).toHaveProperty('duration', 10.75); 222 | expect(actual[0]).toHaveProperty('content', 'foo bar bar baz'); 223 | 224 | expect(actual[1]).toHaveProperty('start', 16.02); 225 | expect(actual[1]).toHaveProperty('duration', 5.25); 226 | expect(actual[1]).toHaveProperty('content', 'bar baz'); 227 | }); 228 | 229 | it('removes HTML from captions', async () => { 230 | mock('getCaptionsUrl', '{caption_url}'); 231 | jest.spyOn(axios, 'get').mockReturnValue({ 232 | data: ` 233 | 234 | <font color="#CCCCCC">foo</font><font color="#E5E5E5"> bar</font> 235 | 236 | `, 237 | }); 238 | 239 | const actual = await module.getCaptions(42); 240 | 241 | expect(actual[0]).toHaveProperty('content', 'foo bar'); 242 | }); 243 | 244 | it('returns an empty array if no url found', async () => { 245 | mock('getCaptionsUrl'); 246 | 247 | const actual = await module.getCaptions(42); 248 | 249 | expect(actual).toEqual([]); 250 | }); 251 | 252 | it('returns an empty array if no captions', async () => { 253 | 
mock('getCaptionsUrl', '{caption_url}'); 254 | jest.spyOn(axios, 'get').mockReturnValue({ 255 | data: ` 256 | 257 | 258 | `, 259 | }); 260 | 261 | const actual = await module.getCaptions(42); 262 | 263 | expect(actual).toEqual([]); 264 | }); 265 | 266 | it('calls getCaptionsUrl with the specified language', async () => { 267 | mock('getCaptionsUrl'); 268 | 269 | await module.getCaptions(42, 'myLanguage'); 270 | 271 | expect(module.getCaptionsUrl).toHaveBeenCalledWith(42, 'myLanguage'); 272 | }); 273 | }); 274 | 275 | /* eslint-disable camelcase */ 276 | describe('getCaptionsUrl', () => { 277 | function mockCaptions(captionTracks) { 278 | mock('getRawVideoInfo', { 279 | player_response: { 280 | captions: { 281 | playerCaptionsTracklistRenderer: { 282 | captionTracks, 283 | }, 284 | }, 285 | }, 286 | }); 287 | } 288 | 289 | it('should take manual captions in the specified language', async () => { 290 | mockCaptions([ 291 | { baseUrl: 'BAD', languageCode: 'ru', kind: 'asr' }, 292 | { baseUrl: 'BAD', languageCode: 'fr' }, 293 | { baseUrl: 'GOOD', languageCode: 'ru' }, 294 | ]); 295 | 296 | const actual = await module.getCaptionsUrl('anything', 'ru'); 297 | 298 | expect(actual).toEqual('GOOD'); 299 | }); 300 | 301 | it('should then try automatic caption in the specified language', async () => { 302 | mockCaptions([ 303 | { baseUrl: 'BAD', languageCode: 'fr' }, 304 | { baseUrl: 'GOOD', languageCode: 'ru', kind: 'asr' }, 305 | ]); 306 | 307 | const actual = await module.getCaptionsUrl('anything', 'ru'); 308 | 309 | expect(actual).toEqual('GOOD'); 310 | }); 311 | 312 | it('should take the first manual caption if no language specified', async () => { 313 | mockCaptions([ 314 | { baseUrl: 'BAD', languageCode: 'en', kind: 'asr' }, 315 | { baseUrl: 'GOOD', languageCode: 'fr' }, 316 | ]); 317 | 318 | const actual = await module.getCaptionsUrl('anything', 'ru'); 319 | 320 | expect(actual).toEqual('GOOD'); 321 | }); 322 | it('should take the first automatic caption if no language 
specified and no manual one', async () => { 323 | mockCaptions([ 324 | { baseUrl: 'GOOD', languageCode: 'en', kind: 'asr' }, 325 | { baseUrl: 'BAD', languageCode: 'fr', kind: 'asr' }, 326 | ]); 327 | 328 | const actual = await module.getCaptionsUrl('anything', 'ru'); 329 | 330 | expect(actual).toEqual('GOOD'); 331 | }); 332 | 333 | it('should return false if no captionTracks', async () => { 334 | mock('getRawVideoInfo', { 335 | player_response: { 336 | captions: { 337 | playerCaptionsTracklistRenderer: { 338 | captionTracks: [], 339 | }, 340 | }, 341 | }, 342 | }); 343 | 344 | const actual = await module.getCaptionsUrl(); 345 | 346 | expect(actual).toEqual(false); 347 | }); 348 | }); 349 | /* eslint-enable camelcase */ 350 | 351 | describe('getPlaylistData', () => { 352 | it('return an object with playlist data', async () => { 353 | const playlistId = 42; 354 | mock('get', { 355 | items: [ 356 | { 357 | snippet: { 358 | title: 'foo', 359 | description: 'bar', 360 | }, 361 | }, 362 | ], 363 | }); 364 | 365 | const actual = await module.getPlaylistData(playlistId); 366 | 367 | expect(actual).toHaveProperty('id', 42); 368 | expect(actual).toHaveProperty('title', 'foo'); 369 | expect(actual).toHaveProperty('description', 'bar'); 370 | }); 371 | }); 372 | 373 | describe('getVideos', () => { 374 | beforeEach(() => { 375 | globals.readFromCache.mockReturnValue(false); 376 | globals.config.mockReturnValue({}); 377 | }); 378 | 379 | it('should get videos from API by default', async () => { 380 | mock('getVideosFromApi', 'videos_from_api'); 381 | 382 | const actual = await module.getVideos(); 383 | 384 | expect(actual).toEqual('videos_from_api'); 385 | }); 386 | 387 | it('should get videos from cache if enabled', async () => { 388 | globals.readFromCache.mockReturnValue(true); 389 | mock('getVideosFromCache', 'cached_videos'); 390 | 391 | const actual = await module.getVideos(); 392 | 393 | expect(actual).toEqual('cached_videos'); 394 | }); 395 | 396 | it('should emit a 
youtube:video event with the videos', async () => { 397 | mock('getVideosFromApi', 'video_list'); 398 | 399 | await module.getVideos(); 400 | 401 | expect(pulse.emit).toHaveBeenCalledWith('youtube:videos', { 402 | videos: 'video_list', 403 | }); 404 | }); 405 | }); 406 | 407 | describe('getVideosData', () => { 408 | it('calls /videos with only one id', async () => { 409 | mock('get'); 410 | const input = 'foo'; 411 | 412 | await module.getVideosData(input); 413 | 414 | expect(module.get).toHaveBeenCalledWith( 415 | 'videos', 416 | objectContaining({ id: 'foo' }) 417 | ); 418 | }); 419 | 420 | it('calls /videos with several ids', async () => { 421 | mock('get'); 422 | const input = ['foo', 'bar']; 423 | 424 | await module.getVideosData(input); 425 | 426 | expect(module.get).toHaveBeenCalledWith( 427 | 'videos', 428 | objectContaining({ id: 'foo,bar' }) 429 | ); 430 | }); 431 | 432 | it('sets the channel key to each video', async () => { 433 | mock('formatVideo'); 434 | mock('get', { items: [{ id: 'foo' }] }); 435 | mock('formatChannel', 'channel info'); 436 | 437 | const actual = await module.getVideosData(); 438 | 439 | expect(actual).toHaveProperty('foo.channel', 'channel info'); 440 | }); 441 | 442 | it('sets the video key to each video', async () => { 443 | mock('formatChannel'); 444 | mock('get', { items: [{ id: 'foo' }] }); 445 | mock('formatVideo', 'video info'); 446 | 447 | const actual = await module.getVideosData(); 448 | 449 | expect(actual).toHaveProperty('foo.video', 'video info'); 450 | }); 451 | 452 | it('sets the .captions key', async () => { 453 | mock('get', { items: [{ id: 'foo' }] }); 454 | mock('formatDuration'); 455 | mock('getCaptions', 'captions'); 456 | 457 | const actual = await module.getVideosData(); 458 | 459 | expect(actual).toHaveProperty('foo.captions', 'captions'); 460 | }); 461 | 462 | it('asks for captions for the video language', async () => { 463 | mock('get', { 464 | items: [{ id: 'foo', snippet: { defaultAudioLanguage: 'ru' } 
}], 465 | }); 466 | mock('formatDuration'); 467 | mock('getCaptions', 'captions'); 468 | 469 | await module.getVideosData(); 470 | 471 | expect(module.getCaptions).toHaveBeenCalledWith('foo', 'ru'); 472 | }); 473 | }); 474 | 475 | describe('getVideosFromCache', () => { 476 | it('should return all videos from one playlist', async () => { 477 | globals.configName.mockReturnValue('my_config'); 478 | globals.config.mockReturnValue({ playlists: ['foo'] }); 479 | fileutils.glob.mockReturnValue(['cache_file']); 480 | fileutils.readJson.mockReturnValue([{ video: 'video_from_cache' }]); 481 | 482 | const actual = await module.getVideosFromCache(); 483 | 484 | expect(actual).toEqual([{ video: 'video_from_cache' }]); 485 | expect(fileutils.glob).toHaveBeenCalledWith( 486 | './cache/my_config/youtube/foo.json' 487 | ); 488 | }); 489 | 490 | it('should return all videos from several playlists', async () => { 491 | globals.configName.mockReturnValue('my_config'); 492 | globals.config.mockReturnValue({ playlists: ['foo', 'bar'] }); 493 | fileutils.glob.mockReturnValue(['cache_file1', 'cache_file2']); 494 | fileutils.readJson.mockReturnValueOnce([{ video: 'video_from_cache1' }]); 495 | fileutils.readJson.mockReturnValueOnce([{ video: 'video_from_cache2' }]); 496 | 497 | const actual = await module.getVideosFromCache(); 498 | 499 | expect(actual).toEqual([ 500 | { video: 'video_from_cache1' }, 501 | { video: 'video_from_cache2' }, 502 | ]); 503 | }); 504 | 505 | it('should exclude videos from the blockList', async () => { 506 | globals.config.mockReturnValue({ 507 | playlists: ['playlistId'], 508 | blockList: ['bar'], 509 | }); 510 | fileutils.glob.mockReturnValue(['glob_path']); 511 | fileutils.readJson.mockReturnValue([ 512 | { video: { id: 'foo' } }, 513 | { video: { id: 'bar' } }, 514 | ]); 515 | 516 | const actual = await module.getVideosFromCache(); 517 | 518 | expect(actual).toEqual([{ video: { id: 'foo' } }]); 519 | }); 520 | }); 521 | 522 | 
describe('getVideosFromPlaylist', () => { 523 | it('should get all videos from the unique page', async () => { 524 | mock('getPlaylistData', { nextPageToken: null }); 525 | mock('getVideosFromPlaylistPage', [{ foo: 'bar' }, { bar: 'baz' }]); 526 | 527 | const actual = await module.getVideosFromPlaylist(); 528 | 529 | expect(actual).toHaveLength(2); 530 | expect(actual[0]).toHaveProperty('foo', 'bar'); 531 | expect(actual[1]).toHaveProperty('bar', 'baz'); 532 | }); 533 | 534 | it('should get all videos from several pages', async () => { 535 | mock('getPlaylistData'); 536 | mock('get') 537 | .mockReturnValueOnce({ nextPageToken: 'token' }) 538 | .mockReturnValueOnce({ nextPageToken: null }); 539 | mock('getVideosFromPlaylistPage') 540 | .mockReturnValueOnce([{ key: 'foo' }, { key: 'bar' }]) 541 | .mockReturnValueOnce([{ key: 'baz' }]); 542 | 543 | const actual = await module.getVideosFromPlaylist(); 544 | 545 | expect(actual[0]).toHaveProperty('key', 'foo'); 546 | expect(actual[1]).toHaveProperty('key', 'bar'); 547 | expect(actual[2]).toHaveProperty('key', 'baz'); 548 | }); 549 | 550 | it('should add the playlist data to each item', async () => { 551 | mock('getPlaylistData', 'playlistData'); 552 | mock('get', { nextPageToken: null }); 553 | mock('getVideosFromPlaylistPage', [{ foo: 'bar' }, { bar: 'baz' }]); 554 | 555 | const actual = await module.getVideosFromPlaylist(); 556 | 557 | expect(actual[0]).toHaveProperty('playlist', 'playlistData'); 558 | expect(actual[1]).toHaveProperty('playlist', 'playlistData'); 559 | }); 560 | }); 561 | 562 | describe('getVideosFromPlaylistPage', () => { 563 | it('should reconcile page information and detail information', async () => { 564 | const input = { 565 | items: [ 566 | { 567 | contentDetails: { videoId: 'foo' }, 568 | snippet: { position: 42 }, 569 | }, 570 | ], 571 | }; 572 | mock('getVideosData', { 573 | foo: { 574 | video: { id: 'foo', title: 'foo bar' }, 575 | }, 576 | }); 577 | 578 | const actual = await 
module.getVideosFromPlaylistPage(input); 579 | 580 | expect(actual[0]).toHaveProperty('video.id', 'foo'); 581 | expect(actual[0]).toHaveProperty('video.positionInPlaylist', 42); 582 | expect(actual[0]).toHaveProperty('video.title', 'foo bar'); 583 | }); 584 | 585 | it('should discard videos excluded by blockList', async () => { 586 | const input = { 587 | items: [ 588 | { 589 | contentDetails: { videoId: 'foo' }, 590 | snippet: { position: 42 }, 591 | }, 592 | { 593 | contentDetails: { videoId: 'bar' }, 594 | snippet: { position: 43 }, 595 | }, 596 | ], 597 | }; 598 | mock('getVideosData', { 599 | foo: { 600 | video: { id: 'bar', title: 'foo bar' }, 601 | }, 602 | }); 603 | globals.config.mockReturnValue({ blockList: ['foo'] }); 604 | 605 | const actual = await module.getVideosFromPlaylistPage(input); 606 | 607 | expect(actual).toHaveLength(1); 608 | expect(actual[0]).toHaveProperty('video.id', 'bar'); 609 | }); 610 | 611 | it('should not discard videos if no blocklist', async () => { 612 | const input = { 613 | items: [ 614 | { 615 | contentDetails: { videoId: 'foo' }, 616 | snippet: { position: 42 }, 617 | }, 618 | ], 619 | }; 620 | mock('getVideosData', { 621 | foo: { 622 | video: { id: 'foo', title: 'foo bar' }, 623 | }, 624 | }); 625 | globals.config.mockReturnValue({ blockList: null }); 626 | 627 | const actual = await module.getVideosFromPlaylistPage(input); 628 | 629 | expect(actual).toHaveLength(1); 630 | expect(actual[0]).toHaveProperty('video.id', 'foo'); 631 | }); 632 | 633 | it('should discard videos with no details', async () => { 634 | const input = { 635 | items: [ 636 | { 637 | contentDetails: { videoId: 'foo' }, 638 | snippet: { position: 42 }, 639 | }, 640 | { 641 | contentDetails: { videoId: 'bar' }, 642 | snippet: { position: 43 }, 643 | }, 644 | ], 645 | }; 646 | mock('getVideosData', { 647 | foo: { 648 | video: { id: 'foo', title: 'foo bar' }, 649 | }, 650 | }); 651 | 652 | const actual = await module.getVideosFromPlaylistPage(input); 653 | 
654 | expect(actual).toHaveLength(1); 655 | expect(actual[0]).toHaveProperty('video.id', 'foo'); 656 | expect(actual[0]).toHaveProperty('video.positionInPlaylist', 42); 657 | expect(actual[0]).toHaveProperty('video.title', 'foo bar'); 658 | }); 659 | 660 | it('should warn about videos without data', async () => { 661 | const input = { 662 | items: [ 663 | { 664 | contentDetails: { videoId: 'foo', customInfo: 'bar' }, 665 | }, 666 | { 667 | contentDetails: { videoId: 'bar' }, 668 | }, 669 | ], 670 | }; 671 | mock('getVideosData', { 672 | bar: { 673 | video: { id: 'bar' }, 674 | }, 675 | }); 676 | 677 | await module.getVideosFromPlaylistPage(input); 678 | 679 | expect(pulse.emit).toHaveBeenCalledWith('warning', anyString, [ 680 | 'https://youtu.be/foo', 681 | ]); 682 | }); 683 | 684 | it('should warn about videos added several times to the playlist', async () => { 685 | const input = { 686 | items: [ 687 | { 688 | contentDetails: { videoId: 'foo' }, 689 | }, 690 | { 691 | contentDetails: { videoId: 'foo' }, 692 | }, 693 | ], 694 | }; 695 | 696 | await module.getVideosFromPlaylistPage(input); 697 | 698 | expect(pulse.emit).toHaveBeenCalledWith('warning', anyString, anyString); 699 | }); 700 | }); 701 | }); 702 | --------------------------------------------------------------------------------