├── .eslintrc.json ├── .editorconfig ├── example.js ├── .gitignore ├── package.json ├── test.js ├── README.md └── index.js /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["plugin:prettier/recommended"], 3 | "plugins": ["prettier"], 4 | "env": { 5 | "node": true 6 | }, 7 | "rules": { 8 | "prettier/prettier": "error" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | charset = utf-8 6 | end_of_line = lf 7 | insert_final_newline = true 8 | trim_trailing_whitespace = true 9 | 10 | [*.{js}] 11 | indent_size = 4 12 | indent_style = space 13 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | let ig = require("./index"); 2 | 3 | ig.scrapeTag('veranda').then(result => { 4 | console.dir(result); 5 | }); 6 | ig.scrapeComment('CPHnIGbBh1k').then(result => { 7 | console.dir(result); 8 | }); 9 | ig.scrapeUserPage('jcvrnd19').then(result => { 10 | console.dir(result); 11 | }); -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | /dist 4 | package-lock.json 5 | 6 | # local env files 7 | .env.local 8 | .env.*.local 9 | 10 | # Log files 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | 15 | # Editor directories and files 16 | .idea 17 | .vscode 18 | *.suo 19 | *.ntvs* 20 | *.njsproj 21 | *.sln 22 | *.sw? 
23 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "instagram-scraping", 3 | "version": "1.0.17", 4 | "description": "NPM module for loading media by hashtag without instagram API", 5 | "main": "index.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git+https://github.com/rzlyp/instagram-scraping.git" 9 | }, 10 | "scripts": { 11 | "test": "mocha -R spec" 12 | }, 13 | "keywords": [ 14 | "instagram", 15 | "hashtag", 16 | "scraping", 17 | "scrape" 18 | ], 19 | "author": "Rizal Yogi Pratama <@rzlyp>", 20 | "license": "MIT", 21 | "bugs": { 22 | "url": "https://github.com/rzlyp/instagram-scraping/issues" 23 | }, 24 | "homepage": "https://github.com/rzlyp/instagram-scraping#readme", 25 | "devDependencies": { 26 | "chai": "3.5.0", 27 | "eslint": "^6.4.0", 28 | "eslint-config-prettier": "^6.3.0", 29 | "eslint-plugin-prettier": "^3.1.1", 30 | "mocha": "3.1.2", 31 | "nock": "9.0.2", 32 | "prettier": "1.18.2" 33 | }, 34 | "dependencies": { 35 | "async": "^2.6.0", 36 | "axios": "^0.21.1", 37 | "bluebird": "^3.5.1" 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | var mocha = require("mocha"), 2 | assert = require("chai").assert, 3 | ig = require("./index"); 4 | 5 | var nock = require("nock"); 6 | var api = nock("https://www.instagram.com") 7 | .persist() 8 | .get("/explore/tags/nrkvalg") 9 | .replyWithFile(200, __dirname + "/fixtures/tagPage.html") 10 | .get(/\/p\/\w+/) 11 | .replyWithFile(200, __dirname + "/fixtures/postPage.html") 12 | .get(/\/explore\/locations\/\d+/) 13 | .replyWithFile(200, __dirname + "/fixtures/locationPage.html"); 14 | 15 | describe("instagram-scraper", function() { 16 | it("should throw error when called with missing tag argument", function(done) { 17 | 
ig.scrapeTag() 18 | .then(function(result) { 19 | assert.fail("Promise should be rejected"); 20 | done(); 21 | }) 22 | .catch(function(err) { 23 | assert.typeOf(err, "error"); 24 | done(); 25 | }); 26 | }); 27 | 28 | it("should return object containing count, total and media", function(done) { 29 | ig.scrapeTag("veranda").then(function(result) { 30 | assert.isAtLeast(result.count, 1); 31 | assert.isAtLeast(result.total, 1); 32 | assert.equal(result.media.length, result.count); 33 | done(); 34 | }); 35 | }); 36 | 37 | it("should throw error when called with missing code argument", function(done) { 38 | ig.scrapeTag() 39 | .then(function(result) { 40 | assert.fail("Promise should be rejected"); 41 | done(); 42 | }) 43 | .catch(function(err) { 44 | assert.typeOf(err, "error"); 45 | done(); 46 | }); 47 | }); 48 | }); 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # instagram-scraping 2 | 3 | [![npm downloads](https://img.shields.io/npm/dt/instagram-scraping.svg)](https://npm.im/instagram-scraping) 4 | 5 | > NodeJS module for loading posts from Instagram by hashtag without API access by means of scraping. 6 | 7 | ## Disclamer 8 | 9 | Instagram has gone to great lengths to prevent scraping and other unauthorized access to their public content. This module is dependant on the markup the public-facing instagram.com. Should that change this module might also stop working as intended. It also only loads the 12 posts that are displayed on first-load without following pagination to load more images. You should take this into consideration when deciding whether this module will work for you. 10 | 11 | ## Cloud Proxy support 12 | To avoid many scraping errors (`302 redirect to login`, and `429 Rate Limited`) this library now has cloud proxy support (free and paid plans are available, depending on your volume). 
Subscribe to https://rapidapi.com/neotank/api/instagram130, which is a proxy for Instagram, and set the `RAPIDAPI_KEY` environment variable when launching your scraping script: `RAPIDAPI_KEY=your-key node index.js`. This will route all requests to Instagram through proxies with retries and response quality checks under the hood. 13 | 14 | ## Installation 15 | 16 | `npm install instagram-scraping` 17 | 18 | ## Usage 19 | 20 | There are some limitations on loading Instagram data, but enjoy it. I hope it will help you. 21 | 22 | ### Tag Scraping Media 23 | 24 | ```javascript 25 | var ig = require('instagram-scraping'); 26 | 27 | ig.scrapeTag('veranda').then((result) => { 28 | console.dir(result); 29 | }); 30 | ``` 31 | 32 | Example response: 33 | 34 | ```json 35 | { 36 | "total": 54, 37 | "medias": [ 38 | { 39 | "media_id": "1684684359967334824", 40 | "shortcode": "CPHnIGbBh1k", 41 | "text": "Selamat siang komuni!🙋 Sportakular hadir lagi untuk mengawali 2018 kita ini dengan penuh semangat dan kebersamaan, berikut jadwal-jadwalnya : sportakular Voly Kamis,4 Januari 2018 18.00 sd selesai Lap.telkom pinggir monumen Sportakular Futsal Jumat , 5 Januari 2018 17.30-20.00 Lap. Meteor Sportakular Badminton Sabtu,6 Januari2018 19.00-21.00 Lap.Pdam (pinggir ITB) Dicatet ya setiap jadwal kegiatannya, biar tidak terlewatkan karena sayang banget untuk dilewatkan. 😉 dan untuk cabang olahraga lain bakalan mimin share lagi so stay tuned dan selalu ingat: 'Berpartisipasi = Auto Kece😎😎' salam olahraga!
#himaik #Ikberaniberkarya #salamsatuik #menujuIKsehat #unikom #sportakular", 42 | "comment_count": { 43 | "count": 0 44 | }, 45 | "like_count": { 46 | "count": 10 47 | }, 48 | "display_url": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/e35/25024357_207155156521690_1744670180115480576_n.jpg?se=7", 49 | "owner_id": "1648294943", 50 | "date": 1515050047, 51 | "thumbnail": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s640x640/sh0.08/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg", 52 | "thumbnail_resource": [ 53 | { 54 | "src": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s150x150/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg", 55 | "config_width": 150, 56 | "config_height": 150 57 | }, 58 | { 59 | "src": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s240x240/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg", 60 | "config_width": 240, 61 | "config_height": 240 62 | } 63 | ] 64 | } 65 | ] 66 | } 67 | ``` 68 | ### Scrape Comment 69 | 70 | ```javascript 71 | // using shortcode for scraping comment 72 | ig.scrapeComment('CPHnIGbBh1k').then((result) => { 73 | console.dir(result); 74 | }); 75 | ``` 76 | ### Deep Tag Scraping 77 | 78 | ```javascript 79 | ig.deepScrapeTagPage('veranda').then((result) => { 80 | console.dir(result); 81 | }); 82 | ``` 83 | 84 | ### Scrape User Page 85 | 86 | ```javascript 87 | // using username for scraping 88 | ig.scrapeUserPage('jscmila').then((result) => { 89 | console.dir(result); 90 | }); 91 | ``` 92 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var axios = require('axios'), 2 | Promise = require('bluebird'), 3 | async = require('async'); 4 | 5 | var rapidApiMode = !!process.env.RAPIDAPI_KEY, 6 | rapidApiURL = 'https://instagram130.p.rapidapi.com/proxy'; 7 | 8 | var userURL = 'https://www.instagram.com/', 9 
| listURL = 'https://www.instagram.com/explore/tags/', 10 | postURL = 'https://www.instagram.com/p/', 11 | locURL = 'https://www.instagram.com/explore/locations/', 12 | dataExp = /window\._sharedData\s?=\s?({.+);<\/script>/; 13 | 14 | var headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'} 15 | 16 | if (rapidApiMode) { 17 | headers['x-rapidapi-key'] = process.env.RAPIDAPI_KEY 18 | } 19 | 20 | var proxifyURL = function (url) { 21 | if (!rapidApiMode) return url; 22 | 23 | return rapidApiURL + '?url=' + encodeURIComponent(url); 24 | } 25 | 26 | exports.scrapeUserPage = function (username) { 27 | return new Promise(function (resolve, reject) { 28 | if (!username) return reject(new Error('Argument "username" must be specified')); 29 | 30 | axios.get(proxifyURL(userURL + username), { headers }).then((result) => { 31 | var data = scrape(result.data); 32 | if (data && data.entry_data && 33 | data.entry_data.ProfilePage && 34 | data.entry_data.ProfilePage[0] && 35 | data.entry_data.ProfilePage[0].graphql && 36 | data.entry_data.ProfilePage[0].graphql.user && 37 | data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media && 38 | data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.count > 0 && data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges) { 39 | var edges = data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges; 40 | async.waterfall([ 41 | (callback) => { 42 | var medias = []; 43 | edges.forEach((post) => { 44 | if (post.node.__typename === 'GraphImage'||post.node.__typename === 'GraphSidecar' || post.node.__typename === 'GraphVideo') { 45 | medias.push(exports.scrapePostData(post)) 46 | } 47 | }); 48 | callback(null, medias); 49 | } 50 | ], (err, results) => { 51 | resolve({ 52 | total: results.length, 53 | medias: results, 54 | user: 
data.entry_data.ProfilePage[0].graphql.user 55 | }) 56 | }) 57 | } 58 | else { 59 | reject(new Error('Error scraping user page "' + username + '"')); 60 | } 61 | }).catch((err) => { 62 | reject(new Error('Error scraping user page "' + username + '"')); 63 | }); 64 | }); 65 | }; 66 | 67 | exports.deepScrapeTagPage = function (tag) { 68 | return new Promise(function (resolve, reject) { 69 | exports.scrapeTag(tag).then(function (tagPage) { 70 | return Promise.map(tagPage.medias, function (media, i, len) { 71 | return exports.scrapePostCode(media.node.shortcode).then(function (postPage) { 72 | tagPage.medias[i] = postPage; 73 | if (postPage.location && postPage.location.has_public_page) { 74 | return exports.scrapeLocation(postPage.location.id).then(function (locationPage) { 75 | tagPage.media[i].location = locationPage; 76 | }) 77 | .catch(function (err) { 78 | console.log("An error occurred calling scrapeLocation inside deepScrapeTagPage" + ":" + err); 79 | }); 80 | } 81 | }) 82 | .catch(function (err) { 83 | console.log("An error occurred calling scrapePostPage inside deepScrapeTagPage" + ":" + err); 84 | }); 85 | }) 86 | .then(function () { resolve(tagPage); }) 87 | .catch(function (err) { 88 | console.log("An error occurred resolving tagPage inside deepScrapeTagPage" + ":" + err); 89 | }); 90 | }) 91 | .catch(function (err) { 92 | console.log("An error occurred calling scrapeTagPage inside deepScrapeTagPage" + ":" + err); 93 | }); 94 | }); 95 | }; 96 | 97 | exports.scrapeTag = function (tag) { 98 | return new Promise(function (resolve, reject) { 99 | if (!tag) return reject(new Error('Argument "tag" must be specified')); 100 | 101 | axios.get(proxifyURL(listURL + tag), { headers }).then((result) => { 102 | var data = scrape(result.data); 103 | var media = data.entry_data && data.entry_data.TagPage && data.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media; 104 | 105 | if (data && media) { 106 | var edges = media.edges; 107 | 108 | async.waterfall([ 109 | 
(callback) => { 110 | var medias = []; 111 | edges.forEach((post) => { 112 | medias.push(exports.scrapePostData(post)) 113 | }); 114 | callback(null, medias); 115 | } 116 | ], (err, results) => { 117 | resolve({ 118 | total: results.length, 119 | medias: results 120 | }) 121 | }) 122 | 123 | } 124 | else { 125 | reject(new Error('Error scraping tag page "' + tag + '"')); 126 | } 127 | }).catch((err) => { 128 | reject(new Error('Error scraping tag page "' + tag + '": ' + err.message)); 129 | }); 130 | }); 131 | }; 132 | exports.scrapeComment = function (shortcode) { 133 | return new Promise(function (resolve, reject) { 134 | if (!shortcode) return reject(new Error('Argument "shortcode" must be specified')); 135 | 136 | axios.get(proxifyURL(postURL+shortcode), { headers }).then((result) => { 137 | var data = scrape(result.data); 138 | var comments = data.entry_data.PostPage[0].graphql.shortcode_media.edge_media_to_parent_comment; 139 | if(comments != undefined){ 140 | let commentList = comments.edges 141 | async.waterfall([ 142 | (callback) => { 143 | var medias = []; 144 | commentList.forEach((post) => { 145 | // console.log(post) 146 | medias.push(post) 147 | }); 148 | callback(null, medias); 149 | } 150 | ], (err, results) => { 151 | if(err){ 152 | reject(new Error('comment not found for "' + page + '"')); 153 | } 154 | resolve({ 155 | total: comments.count, 156 | medias: results 157 | }) 158 | }) 159 | }else { 160 | reject(new Error('comment not found for "' + page + '"')); 161 | } 162 | 163 | }).catch((err) => { 164 | reject(new Error('Error scraping page "' + page + '"')); 165 | }); 166 | }); 167 | }; 168 | 169 | exports.scrapePostData = function (post) { 170 | var scrapedData = { 171 | media_id: post.node.id, 172 | shortcode: post.node.shortcode, 173 | text: post.node.edge_media_to_caption.edges[0] && post.node.edge_media_to_caption.edges[0].node.text, 174 | comment_count: post.node.edge_media_to_comment.count, 175 | like_count: post.node.edge_liked_by.count, 
176 | display_url: post.node.display_url, 177 | owner_id: post.node.owner.id, 178 | date: post.node.taken_at_timestamp, 179 | thumbnail: post.node.thumbnail_src, 180 | thumbnail_resource: post.node.thumbnail_resources, 181 | is_video: post.node.is_video 182 | } 183 | 184 | if (post.node.is_video) { 185 | scrapedData.video_view_count = post.node.video_view_count; 186 | } 187 | 188 | return post; 189 | } 190 | 191 | exports.scrapePostCode = function (code) { 192 | return new Promise(function (resolve, reject) { 193 | if (!code) return reject(new Error('Argument "code" must be specified')); 194 | 195 | axios.get(proxifyURL(postURL + code), { headers }).then((result) => { 196 | //if (err) return reject(err); 197 | 198 | var data = scrape(result.data); 199 | if (data && data.entry_data && 200 | data.entry_data.PostPage[0] && 201 | data.entry_data.PostPage[0].graphql && 202 | data.entry_data.PostPage[0].graphql.shortcode_media) { 203 | resolve(data.entry_data.PostPage[0].graphql.shortcode_media); 204 | } 205 | else { 206 | reject(new Error('Error scraping post page "' + code + '"')); 207 | } 208 | }).catch((err) => { 209 | reject(new Error('Error scraping post page "' + code + '":' + err)); 210 | }); 211 | }); 212 | } 213 | 214 | exports.scrapeLocation = function (id) { 215 | return new Promise(function (resolve, reject) { 216 | if (!id) return reject(new Error('Argument "id" must be specified')); 217 | 218 | axios.get(proxifyURL(locURL + id), { headers }).then((result) => { 219 | var data = scrape(result.data); 220 | 221 | if (data && data.entry_data && (typeof data.entry_data.LocationsPage !== "undefined")) { 222 | resolve(data.entry_data.LocationsPage[0].location); 223 | } 224 | else { 225 | reject(new Error('Error scraping location page "' + id + '"')); 226 | } 227 | }).catch((err) => { 228 | reject(new Error('Error scraping user page "' + id + '"')); 229 | });; 230 | }); 231 | } 232 | var scrape = function (html) { 233 | try { 234 | var dataString = 
html.match(dataExp)[1]; 235 | var json = JSON.parse(dataString); 236 | } 237 | catch (e) { 238 | if (process.env.NODE_ENV != 'production') { 239 | console.error('The HTML returned from instagram was not suitable for scraping'); 240 | } 241 | return null 242 | } 243 | 244 | return json; 245 | } 246 | --------------------------------------------------------------------------------