├── README.md ├── character_list5.csv ├── character_mapping.csv ├── meta_data7.csv ├── movieObj.js └── script-download.js /README.md: -------------------------------------------------------------------------------- 1 | # Polygraph's Film Dialogue Dataset 2 | 3 | 04/12/2016 - just pushed a major update of roughly 200 films based on reader feedback. We also decided to remove several datasets that provided additional metadata that wasn't published in the article. :( 4 | 5 | Note: I am correcting the csv data as people find errors in our character mapping or omitted characters. Sorry if you end up forking an old data set. 6 | 7 | A previous version presented the data as "lines." This turned out to be a very ambiguous word. In reality, we had compiled the total number of words, by character, and then converted them to lines using an average of 10 words per line. This is creating more confusion than needed, so we're moving back to just words, which is what is currently in the CSV data to begin with. The minute-by-minute data, however, is still based on lines (i.e., a row of dialogue text). 8 | 9 | character_list5.csv - this is the data that powers all of the calculations on polygraph.cool/films. It uses the most accurate script that we can find for a given film. People are understandably finding errors, so we will be updating this file as much as possible. 10 | 11 | meta_data7.csv - this is a unique list of IMDB_IDs from the character_list file, with additional metadata, such as release year and domestic, inflation-adjusted gross. 12 | 13 | The selected scripts and their sources are also publicly maintained here: https://docs.google.com/spreadsheets/d/1fbcldxxyRvHjDaaY0EeQnQzvSP7Ub8QYVM2bIs-tKH8/edit#gid=1668340193 14 | 15 | To parse the line data in meta_data7.csv: we assume that a minute of dialogue is roughly 14 lines (using average speaking pace 140 words/min. and average words per line of about 10). 
16 | 17 | So each numeral in the string is the number of MALE lines for half a minute. So if we split the string into groups of two and add the two numerals in each group, we have the total number of male lines for roughly one minute of time. 18 | 19 | Here's the JS code from the article that we use to parse that string: 20 | 21 | var lineInfo = data.lines_data.match(/.{1,2}/g); 22 | 23 | for (line in lineInfo){ 24 | var minuteTotal = +lineInfo[line].slice(0,1) + +lineInfo[line].slice(1,2); 25 | var row = [minuteTotal,14-minuteTotal]; 26 | lineData.push(row); 27 | } 28 | 29 | Each row is an array of [male lines out of 14 representing one minute, female lines out of 14 representing one minute] 30 | 31 | -------------------------------------------------------------------------------- /character_list5.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matthewfdaniels/scripts/36ad4db9ac3619c0934402470b9c73d317c306fa/character_list5.csv -------------------------------------------------------------------------------- /character_mapping.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matthewfdaniels/scripts/36ad4db9ac3619c0934402470b9c73d317c306fa/character_mapping.csv -------------------------------------------------------------------------------- /meta_data7.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matthewfdaniels/scripts/36ad4db9ac3619c0934402470b9c73d317c306fa/meta_data7.csv -------------------------------------------------------------------------------- /script-download.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs"); 2 | const async = require("async"); 3 | const request = require("request"); 4 | const jsdom = require("jsdom").jsdom; 5 | const fountain = require('fountain-js'); 6 | const cheerio = 
require('cheerio');
const util = require("util");
const moment = require("moment");
const exec = require('child_process').exec;
const execFile = require('child_process').execFile;
//const movieObj = require(`${__dirname}/../../data/js/movieObj`);
const movieObj = require(`./movieObj`);

// Root folder for everything this script reads back or writes:
// pdf/, text/ and ocr_output/ all live directly below it.
const DOWNLOAD_DIR = `${__dirname}/../../data/script_downloads`;

const startTime = moment();

// Scrape ids that already have a file in pdf/ or text/.
const downloadedScrapeIds = [];
// Scrape ids from movieObj that still have to be fetched.
const pendingScrapeIds = [];

// Ids to skip because an earlier run could not fetch them. Nothing in this
// file fills the list, so currently no id is excluded on that ground.
const failedDownloads = [];
// TSV body (header row + one scrape id per line) written to dead-links.tsv.
let newFails = "scrape_id\n";

// URL prefix -> CSS selector of the element holding the screenplay text.
// Only the imsdb selector is applied; for the "manual" source the keys are
// used purely to tell known sites from unknown ones.
const html_paths = {
    "http://www.imsdb.com": {
        path: "#mainbody > table:nth-child(3) > tbody > tr > td:nth-child(3) > table > tbody > tr > td > pre"
    },
    "http://www.lynchnet.com": {
        path: "body > pre"
    },
    "http://www.pages.drexel.edu/~ina22/splaylib": {
        path: "body > x-claris-window > x-claris-tagview > pre"
    },
    "http://leonscripts.": {
        path: "body > pre"
    },
    "http://www.aellea.com": {
        path: "body > pre"
    },
    "http://www.dailyscript.com": {
        path: "body > pre > blockquote > blockquote"
    },
    "http://www.horrorlair.com": {
        path: "body > div.Section1"
    },
    "http://www.pages.drexel.edu": {
        path: "body > x-claris-window > x-claris-remotesave > x-claris-tagview > pre"
    },
    "http://www.scifiscripts.com": {
        path: "body > pre > font"
    },
    "http://www.theneitherworld.com": {
        path: "body > pre"
    }
};

/**
 * Normalise a script link from movieObj into a URL that can be requested.
 *
 * @param {string} url - Raw link as stored in movieObj.
 * @returns {string} The rewritten URL. Non-string input is returned unchanged
 *   so a record without a link fails later as a request error, not a crash here.
 */
function cleanURL(url) {

    if (typeof url !== "string") {
        return url;
    }

    if (url.startsWith("http://www.imsdb.com/Movie Scripts/")) {
        url = url.replace("http://www.imsdb.com/Movie Scripts/", "http://www.imsdb.com/scripts/");
        // Drops the last 12 characters (presumably " Script.html" - TODO confirm)
        // and hyphenates the remaining spaces.
        url = `${url.slice(0, -12).replace(/ /g, "-")}.html`;
        url = url.replace("-.html", ".html");
    }

    if (url.startsWith("http://www.sellingyourscreenplay.com/script-library/")) {
        url = `${url}.html`;
    }

    // Repairs links that lost the slash between host and path.
    url = url.replace("www.awesomefilm.comscript", "www.awesomefilm.com/script");

    return url;
}

/** Path of the plain-text file for a scrape id. */
function textPathFor(scrape_id) {
    return `${DOWNLOAD_DIR}/text/scrape-${scrape_id}.txt`;
}

/** Path of the PDF file for a scrape id. */
function pdfPathFor(scrape_id) {
    return `${DOWNLOAD_DIR}/pdf/scrape-${scrape_id}.pdf`;
}

/**
 * Write a file, then continue regardless of the outcome.
 *
 * fs.writeFile requires a callback; passing one also means the caller only
 * moves on once the data has been handed to the file system.
 *
 * @param {string} path - Destination file.
 * @param {string|Buffer} data - Content to write.
 * @param {string} encoding - Encoding used when data is a string.
 * @param {Function} next - Called with no arguments when the write has finished.
 */
function saveFile(path, data, encoding, next) {
    fs.writeFile(path, data, encoding, (err) => {
        if (err) {
            console.log(err);
        }
        next();
    });
}

/**
 * Reduce a Content-Type header to its lower-cased media type.
 *
 * @param {*} header - Raw header value; may be undefined.
 * @returns {string} e.g. "text/html" for "Text/HTML; charset=ISO-8859-1", "" when absent.
 */
function mediaTypeOf(header) {
    if (typeof header !== "string") {
        return "";
    }
    return header.split(";")[0].trim().toLowerCase();
}

/** Extract the screenplay from a locally stored imsdb page and save it as text. */
function saveImsdbScript(scrape_id, movie, next) {
    // NOTE(review): this reads from ../data while every other path in this file
    // uses ../../data - confirm which one is intended.
    fs.readFile(`${__dirname}/../data/${movie.link}`, "utf8", (err, data) => {
        if (err) {
            console.log(err);
            return next();
        }

        const $ = cheerio.load(data);
        const scriptNodes = $(html_paths["http://www.imsdb.com"].path);

        console.log(scriptNodes.length);
        if (scriptNodes.length === 0) {
            return next();
        }
        saveFile(textPathFor(scrape_id), scriptNodes.text(), "utf8", next);
    });
}

/** Copy a locally stored scriptdrive file into the pdf download folder. */
function copyScriptdrivePdf(scrape_id, movie, next) {
    fs.readFile(`${__dirname}/../../data/scriptdrive/${movie.link}`, (err, data) => {
        if (err) {
            console.log(err);
            return next();
        }

        console.log(`${movie.link}`);
        // data is a Buffer here, so the encoding argument is not applied to it.
        saveFile(pdfPathFor(scrape_id), data, "binary", next);
    });
}

/**
 * Download a script over HTTP and store it as text or PDF depending on the
 * response's media type. Ids that cannot be fetched are appended to newFails.
 */
function downloadManualScript(scrape_id, url, next) {

    const recordDeadLink = () => {
        newFails += `${scrape_id}\n`;
        next();
    };

    // The body is requested as a "binary" string so PDF bytes survive untouched.
    // NOTE(review): text bodies are then written as utf8, which may re-encode
    // non-ASCII characters - confirm this is acceptable for the text pipeline.
    request.get({url: url, encoding: "binary", timeout: 120000}, (err, resp, body) => {

        if (err) {
            console.log(err);
            return recordDeadLink();
        }

        // An error page is not a script: treat a missing response or any
        // 4xx/5xx status as a dead link rather than saving its body.
        if (typeof resp === "undefined" || resp.statusCode >= 400) {
            console.log(typeof resp === "undefined" ? "NO RESPONSE" : `HTTP ${resp.statusCode}`);
            return recordDeadLink();
        }

        const contentType = resp.headers ? resp.headers["content-type"] : undefined;
        console.log(contentType);

        const mediaType = mediaTypeOf(contentType);

        if (mediaType === "text/plain") {
            console.log("TEXT");
            return saveFile(textPathFor(scrape_id), body, "utf8", next);
        }

        if (mediaType === "application/pdf") {
            console.log("PDF");
            return saveFile(pdfPathFor(scrape_id), body, "binary", next);
        }

        if (mediaType === "text/html") {
            console.log("HTML");

            const lowerUrl = url.toLowerCase();
            const isKnownSite = Object.keys(html_paths).some((prefix) => {
                return lowerUrl.startsWith(prefix.toLowerCase());
            });
            if (!isKnownSite) {
                console.log("NO MATCH");
            }

            // Known and unknown sites are stored the same way: the text of the whole page.
            const $ = cheerio.load(body);
            return saveFile(textPathFor(scrape_id), $.root().text(), "utf8", next);
        }

        // Any other media type cannot be used by later steps.
        recordDeadLink();
    });
}

/**
 * Strip password protection from a PDF in place using qpdf.
 *
 * The original is moved aside first and only deleted once qpdf has produced a
 * replacement; if qpdf produced nothing, the original is moved back.
 *
 * @param {string} inputPDF - PDF to decrypt; overwritten with the decrypted copy.
 * @param {string} tempPDF - Scratch path used while qpdf runs.
 * @param {Function} done - Called with no arguments when finished.
 */
function decryptPDF(inputPDF, tempPDF, done) {
    console.log("removing passwords");

    fs.rename(inputPDF, tempPDF, (renameErr) => {
        if (renameErr) {
            console.log(renameErr);
            return done();
        }

        // execFile passes the paths as arguments, so no shell ever parses them.
        execFile("qpdf", ["--decrypt", tempPDF, inputPDF], (err, stdout, stderr) => {
            if (err || stderr) {
                console.log(err);
                console.log(stderr);
            }

            fs.access(inputPDF, (outputMissing) => {
                if (outputMissing) {
                    return fs.rename(tempPDF, inputPDF, (restoreErr) => {
                        if (restoreErr) {
                            console.log(restoreErr);
                        }
                        done();
                    });
                }
                fs.unlink(tempPDF, (unlinkErr) => {
                    if (unlinkErr) {
                        console.log(unlinkErr);
                    }
                    done();
                });
            });
        });
    });
}

/**
 * Run pdfminer's pdf2txt.py on a PDF and save its stdout as the text file.
 *
 * @param {string} inputPDF - PDF to read.
 * @param {string} outputTxt - Text file to write.
 * @param {Function} done - Called with true when text was extracted, false otherwise.
 */
function extractPDFText(inputPDF, outputTxt, done) {
    console.log("extracting text");

    execFile("python",
        [`${__dirname}/../data/pdfminer-20140328/build/scripts-2.7/pdf2txt.py`, inputPDF],
        { maxBuffer: 1024 * 1000 },
        (err, stdout, stderr) => {
            if (err || stderr) {
                console.log(err);
                console.log(stderr);
                return done(false);
            }
            saveFile(outputTxt, stdout, "utf8", () => { done(true); });
        });
}

// The pipeline: each step starts only after the previous one has called cb.
async.series({

    // Step 1: find out what is already on disk and derive the pending list.
    getListOfDownloadedScripts: (cb) => {

        // Sets give constant-time membership checks; the arrays stay the
        // source of truth for the counts printed below.
        const downloaded = new Set(downloadedScrapeIds);

        async.forEachSeries(["pdf", "text"], (folder, cb1) => {

            fs.readdir(`${DOWNLOAD_DIR}/${folder}/`, (err, files) => {
                if (err) {
                    console.log(err);
                }

                (files || []).forEach((file) => {
                    const scrape_id = file.replace("scrape-", "")
                        .replace(".pdf", "")
                        .replace(".txt", "");

                    if (!downloaded.has(scrape_id)) {
                        downloaded.add(scrape_id);
                        downloadedScrapeIds.push(scrape_id);
                    }
                });

                async.setImmediate(() => { cb1(); });
            });

        }, () => {

            const failed = new Set(failedDownloads);
            const pending = new Set(pendingScrapeIds);

            Object.keys(movieObj).forEach((scrape_id) => {
                if (!downloaded.has(scrape_id) && !pending.has(scrape_id) && !failed.has(scrape_id)) {
                    pending.add(scrape_id);
                    pendingScrapeIds.push(scrape_id);
                }
            });

            console.log(`Total URLS (removing ignore):\t${downloadedScrapeIds.length + pendingScrapeIds.length}`);
            console.log(`Downloaded scripts:\t\t${downloadedScrapeIds.length}`);
            console.log(`Pending (nonfailed) scripts:\t${pendingScrapeIds.length}`);
            cb();
        });
    },

    // Step 2: fetch every pending script, one at a time, according to its source.
    iterateMovies: (cb) => {

        let index = 1;

        async.forEachSeries(pendingScrapeIds, (pendingId, cb1) => {

            const scrape_id = pendingId.toString();
            const movie = movieObj[scrape_id];
            // Deferring the continuation keeps long runs from growing the call stack.
            const next = () => { async.setImmediate(() => { cb1(); }); };

            console.log("\n------------------");
            console.log(`${index}/${pendingScrapeIds.length}`);
            console.log(`Scrape id:\t${scrape_id}`);
            console.log(movie);

            if (typeof movie === "undefined") {
                console.log("IGNORE");
                return next();
            }

            const url = cleanURL(movie.link);

            console.log(url);
            console.log(movie.source);
            index++;

            if (movie.source === "imsdb") {
                return saveImsdbScript(scrape_id, movie, next);
            }
            if (movie.source === "scriptdrive") {
                return copyScriptdrivePdf(scrape_id, movie, next);
            }
            if (movie.source === "manual") {
                return downloadManualScript(scrape_id, url, next);
            }

            console.log("BAD SOURCE");
            next();

        }, () => {
            fs.writeFile(`${__dirname}/../../data/dead-links.tsv`, newFails, "utf8", (err) => {
                if (err) {
                    console.log(err);
                }
                cb();
            });
        });
    },

    // Step 3: decrypt every PDF in ocr_output/ and extract its text into text/.
    turnPDFsToText: (cb) => {

        //var pdfDirectory = `${DOWNLOAD_DIR}/pdf/`;
        const pdfDirectory = `${DOWNLOAD_DIR}/ocr_output/`;

        fs.readdir(pdfDirectory, (readErr, entries) => {
            if (readErr) {
                console.log(readErr);
            }

            const pdfs = entries || [];
            let failedPDFScans = "";
            let index = 0;

            async.forEachSeries(pdfs, (pdf, cb1) => {

                const next = () => { async.setImmediate(() => { cb1(); }); };

                if (pdf === ".DS_Store") {
                    return next();
                }

                const fileName = pdf.slice(0, -4);
                const inputPDF = `${pdfDirectory}${pdf}`;
                const outputTxt = `${DOWNLOAD_DIR}/text/${fileName}.txt`;

                index++;

                fs.access(inputPDF, (missing) => {
                    if (missing) {
                        return next();
                    }

                    console.log("--------------------------");
                    console.log(fileName);
                    console.log(`${index}/${pdfs.length}`);

                    async.series([
                        (cb2) => {
                            decryptPDF(inputPDF, `${pdfDirectory}temp.pdf`, () => { cb2(); });
                        },
                        (cb2) => {
                            extractPDFText(inputPDF, outputTxt, (succeeded) => {
                                if (!succeeded) {
                                    failedPDFScans += `${pdf}\n`;
                                }
                                cb2();
                            });
                        }
                    ], next);
                });

            }, () => {
                console.log(failedPDFScans);
                cb();
            });
        });
    }

}, () => {
    // Runs once, after the last step has finished.
    console.log(moment().diff(startTime, "minutes"));
    console.log("Finished: script-download.js");
});