├── .gitignore
├── Instaloader_scripts
│   ├── automate_commandline_execution.py
│   ├── get_profile_posts
│   │   ├── get_profile_posts
│   │   │   └── main.py
│   │   └── readme.md
│   ├── hashtag_post_between_dates.py
│   ├── instagram_comments_to_xlsx.py
│   ├── json_posts_to_xlsx.py
│   ├── posts_to_sql.py
│   └── profile_posts_to_xlsx_fast
│       ├── posts_to_xlsx
│       │   ├── export.py
│       │   ├── extractor.py
│       │   ├── main.py
│       │   └── query.py
│       └── readme.md
├── LICENSE
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/Instaloader_scripts/automate_commandline_execution.py:
--------------------------------------------------------------------------------
# Libraries
import pandas as pd
import json
import glob
from datetime import datetime
import os
import time

# This code is an example of how to automate the execution of Instaloader commands in a terminal.
# You'll need an Instagram account to log in with.
# This is necessary to avoid the limits of the Instagram API.
# Once you launch the script, you will be asked to enter your account password.

# STEP 1: Define the variables
your_account = "your_account_login"
accounts_list = ["account1", "account2", "etc"]


def to_xlsx(file):
    info = {}

    with open(file) as f:
        data = json.load(f)

    try:
        info["type"] = data["node"]["__typename"]
    except (IndexError, KeyError):
        info["type"] = "null"
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["caption"] = data["node"]["accessibility_caption"]
    except (IndexError, KeyError):
        info["caption"] = "no-caption"
    try:
        info["has_audio"] = data["node"]["has_audio"]
    except (IndexError, KeyError):
        info["has_audio"] = "null"
    try:
        info["location"] = data["node"]["location"]["name"]
    except (IndexError, KeyError, TypeError):
        info["location"] = "none"
    try:
        info["location-id"] = data["node"]["location"]["id"]
    except (IndexError, KeyError, TypeError):
        info["location-id"] = "none"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except (IndexError, KeyError, TypeError):
        info["account_name"] = "null"
    try:
        info["shortcode"] = "https://www.instagram.com/p/" + data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = "null"
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"
    try:
        info["full_name"] = data["node"]["owner"]["full_name"]
    except KeyError:
        info["full_name"] = "null"
    try:
        info["is_professional_account"] = data["node"]["owner"]["is_professional_account"]
    except KeyError:
        info["is_professional_account"] = "null"
    try:
        info["is_business_account"] = data["node"]["owner"]["is_business_account"]
    except KeyError:
        info["is_business_account"] = "null"
    try:
        info["is_verified"] = data["node"]["owner"]["is_verified"]
    except KeyError:
        info["is_verified"] = "null"
    try:
        info["is_video"] = data["node"]["is_video"]
    except KeyError:
        info["is_video"] = "null"
    try:
        info["category_name"] = data["node"]["owner"]["category_name"]
    except KeyError:
        info["category_name"] = "null"
    try:
        info["followed_by"] = data["node"]["owner"]["edge_followed_by"]["count"]
    except KeyError:
        info["followed_by"] = "null"
    return info


# THIS IS THE MAIN FOR LOOP TO ITERATE OVER ACCOUNTS
for insta in accounts_list:

    global_df = []
    # THIS IS THE COMMAND EXECUTED IN TERMINAL
    os.system(f"instaloader {insta} --no-videos --no-pictures --no-captions --no-compress-json --max-connection-attempts 0 --login {your_account}")
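    # NOTE (illustrative alternative): os.system builds a raw shell string. The
    # same call could be made with subprocess.run, which avoids shell-quoting
    # issues; a sketch, assuming the same flags as above:
    #
    #     import subprocess
    #     subprocess.run(
    #         ["instaloader", insta, "--no-videos", "--no-pictures",
    #          "--no-captions", "--no-compress-json",
    #          "--max-connection-attempts", "0", "--login", your_account],
    #         check=True,
    #     )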
    json_files = glob.glob(f"{insta}/*.json")  # Path to JSON files
    amount = len(json_files)  # Count the number of files for a progress bar

    for json_file in json_files:
        print(json_file)
        info = to_xlsx(json_file)  # This is the main function

        df = pd.DataFrame({
            "type": info["type"],
            "post_date": info["timestamp"],
            "account_id": info["account_name"],
            "full_name": info["full_name"],
            "text": info["post_text"],
            "caption": info["caption"],
            "post_shortcode": info["shortcode"],
            "like_count": info["like_count"],
            "comment_count": info["comment_count"],
            "is_video": info["is_video"],
            "has_audio": info["has_audio"],
            "video_view_count": info["video_view_count"],
            "comments_policy": info["comments_disabled"],
            "is_professional": info["is_professional_account"],
            "is_business": info["is_business_account"],
            "is_verified": info["is_verified"],
            "person_category": info["category_name"],
            "location": info["location"],
            "location_id": info["location-id"]
        }, index=[1])
        global_df.append(df)

    os.makedirs("datasets", exist_ok=True)  # Make sure the output folder exists
    final = pd.concat(global_df)
    final.to_excel(f"datasets/{insta}.xlsx", index=False)  # Your filename
    print("job done!")
    print("sleeping for 1 minute")
    time.sleep(60)  # Sleep 1 minute between accounts
    print("Start new")
--------------------------------------------------------------------------------
/Instaloader_scripts/get_profile_posts/get_profile_posts/main.py:
--------------------------------------------------------------------------------
import instaloader
import pandas as pd
import json
import time
import os
import glob
from datetime import datetime

# Global variables. Please define usernames (without @)

profile_list = ["username", "username", "username"]
save_path = "output_folder/"
sleep_seconds = 300  # Sleep time between users, in seconds. 5 minutes recommended.

def get_profile_posts(username, save_path):
    save_path = save_path + username + "/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    L = instaloader.Instaloader(
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        compress_json=False,
        download_geotags=False,
        post_metadata_txt_pattern=None,
        max_connection_attempts=0,
        download_comments=False,
    )

    profile = instaloader.Profile.from_username(L.context, username)
    posts = profile.get_posts()
    for post in posts:
        post_sleep = 1  # Sleep 1 second between posts
        print("sleeping for: " + str(post_sleep) + " seconds")
        time.sleep(post_sleep)

        data = post.__dict__
        data_node = data["_node"]  # Raw metadata node behind the Post object
        captured_on = time.strftime("%Y-%m-%d")
        file_name = captured_on + "_" + post.shortcode
        with open(os.path.join(save_path, file_name + ".json"), "w", encoding='utf-8') as write_file:
            json.dump(data_node, write_file, sort_keys=True, indent=4, ensure_ascii=False)
            print(write_file.name)


def decode_jsons(username, save_path):

    path_to_files = save_path + username + "/"
    json_files = glob.glob(os.path.join(path_to_files, "*.json"))

    list_of_df = []

    for file in json_files:
        with open(file, encoding="utf-8") as f:
            data = json.load(f)

        list_owner_id = []
        list_post_date = []
        list_likes = []
        list_comments = []
        list_caption = []
        list_of_tagged_users = []
        list_hashtags_in_text = []
        list_is_video = []
        list_post_shortcode = []

        user_id = data["owner"]["id"]
        list_owner_id.append(user_id)

        date = datetime.fromtimestamp(data["taken_at_timestamp"])
        list_post_date.append(date)

        is_video = data["is_video"]
        list_is_video.append(is_video)

        try:
            vid_v_count = data["video_view_count"]
        except KeyError:
            vid_v_count = "FALSE"

        shortcode = "https://www.instagram.com/p/" + data["shortcode"]
        list_post_shortcode.append(shortcode)

        comments = data["edge_media_to_comment"]["count"]
        list_comments.append(comments)

        try:
            tagged_users = data["edge_media_to_tagged_user"]["edges"]
            list_of_tagged = []
            for user in tagged_users:
                tagged_user = user["node"]["user"]["username"]
                list_of_tagged.append(tagged_user)
            list_of_tagged_users.append(list_of_tagged)
        except KeyError:
            list_of_tagged_users.append("False")

        try:
            caption = data["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (IndexError, KeyError):
            caption = "No Caption"  # Keeps `caption` defined for the hashtag scan below
        list_caption.append(caption)

        hashtags_in_text = [word for word in caption.split() if word.startswith("#")]
        list_hashtags_in_text.append(hashtags_in_text)

        likes = data["edge_media_preview_like"]["count"]
        list_likes.append(likes)

        df = pd.DataFrame({
            "username": username,
            "user_id": list_owner_id,
            "post_date": list_post_date,
            "caption": list_caption,
            "likes": list_likes,
            "comments": list_comments,
            "tagged_users": list_of_tagged_users,
            "hashtags": list_hashtags_in_text,
            "is_video": list_is_video,
            "vid_view_count": vid_v_count,
            "shortcode": list_post_shortcode,
        })
        list_of_df.append(df)

    final_frame = pd.concat(list_of_df)
    final_frame.to_excel(username + ".xlsx")

def loop():
    for username in profile_list:

        print("Getting Data for: " + username)
        get_profile_posts(username, save_path)  # This will collect all Instagram data

        print("Parsing Data for: " + username)
        decode_jsons(username, save_path)  # This will convert the JSON files to a dataframe
        print("Finished: " + username)

        actual_time = time.strftime("%H:%M:%S")
        time_sleep = sleep_seconds / 60
        print("sleeping for: " + str(time_sleep) + " minutes at " + actual_time)
        time.sleep(sleep_seconds)


def controller():

    # First step:
    # Download posts, extract data, serialize the JSON and create a .xlsx file for each user
    loop()

    # Second step:
    # Read all .xlsx files downloaded in the previous step and create a single file with all the data
    path = os.getcwd()
    files = glob.glob(os.path.join(path, "*.xlsx"))

    list_of_frames = []
    for file in files:
        df = pd.read_excel(file)
        list_of_frames.append(df)
    actual_time = time.strftime("%Y-%m-%d")
    df = pd.concat(list_of_frames)
    df.to_excel("your_dataset-" + actual_time + ".xlsx")

controller()
--------------------------------------------------------------------------------
/Instaloader_scripts/get_profile_posts/readme.md:
--------------------------------------------------------------------------------
# Instagram Profile Posts to ".xlsx" file

# Download posts from any public Instagram account
### dependencies
```bash
Instaloader (for Instagram data collection)
pandas (for dataframe creation)
openpyxl (for xlsx export)
xlrd (to import xlsx files)
```
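A minimal install command, assuming pip and the usual PyPI package names:

```bash
pip install instaloader pandas openpyxl xlrd
```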
### setup

Set the target accounts in "profile_list" --> main.py (line 11). For example:
```python
profile_list = ["profile_name1", "profile_name2", "etc"]
```

### Run the script:
```
python main.py
```

### **Output**
The script will create:
- A new directory for each account
- A JSON file for each post
- A .xlsx file for each account with its data
- A .xlsx file with all accounts in the same dataset

#### Citation
*Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Available from: https://github.com/AdriaPadilla/InstaloaderScripts/*
--------------------------------------------------------------------------------
/Instaloader_scripts/hashtag_post_between_dates.py:
--------------------------------------------------------------------------------
from datetime import datetime
from itertools import dropwhile, takewhile

import instaloader
import pandas as pd

def hashtag_post_between(hashtag):

    L = instaloader.Instaloader(  # Main class info: https://instaloader.github.io/as-module.html#instaloader-main-class
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        compress_json=False,
        download_geotags=False,
        post_metadata_txt_pattern=None,
        max_connection_attempts=0,
        download_comments=False,
    )

    posts = L.get_hashtag_posts(hashtag)

    SINCE = datetime(2019, 12, 19)  # Most recent date / format = (yyyy, mm, dd)
    UNTIL = datetime(2019, 9, 1)  # Oldest date / format = (yyyy, mm, dd)
    print("capturing posts from: " + str(SINCE) + " to: " + str(UNTIL))

    owner_username_list = []
    owner_id_list = []
    post_date_list = []
    post_caption_list = []
    tagged_users_list = []
    caption_mentions_list = []
    is_video_list = []
    video_view_count_list = []
    video_duration_list = []
    likes_list = []
    comments_list = []
    post_url_list = []
    hashtags_caption_list = []
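    # How the date window works: the hashtag feed is iterated newest-first, so
    # dropwhile below skips every post newer than SINCE, and takewhile then
    # keeps posts until it reaches one older than UNTIL. With the dates above,
    # only posts between 2019-09-01 and 2019-12-19 are collected.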
    for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, posts)):
        owner_username_list.append(post.owner_username)
        owner_id_list.append(post.owner_id)
        post_date_list.append(post.date_utc)
        post_caption_list.append(post.caption)
        tagged_users_list.append(post.tagged_users)
        caption_mentions_list.append(post.caption_mentions)
        is_video_list.append(post.is_video)
        video_view_count_list.append(post.video_view_count)
        video_duration_list.append(post.video_duration)
        likes_list.append(post.likes)
        comments_list.append(post.comments)
        post_url_list.append(post.shortcode)
        hashtags_caption_list.append(post.caption_hashtags)

    for date, name in zip(post_date_list, owner_username_list):
        print(date, name)

    df = pd.DataFrame({
        "owner_username": owner_username_list,
        "owner_id": owner_id_list,
        "post_date": post_date_list,
        "likes": likes_list,
        "comments": comments_list,
        "post_caption": post_caption_list,
        "hashtags_caption": hashtags_caption_list,
        "tagged_users": tagged_users_list,
        "caption_mentions": caption_mentions_list,
        "is_video": is_video_list,
        "video_view_count": video_view_count_list,
        "video_duration": video_duration_list,
        "post_shortcode": post_url_list,
    })

    df.to_excel(hashtag + ".xlsx")

    print(df)

hashtag_list = ['hashtag_1', 'hashtag_2', 'hashtag_n']  # You can use a single hashtag or iterate the function over a list. Hashtags go without the hash symbol (#).

for hashtag in hashtag_list:
    hashtag_post_between(hashtag)
--------------------------------------------------------------------------------
/Instaloader_scripts/instagram_comments_to_xlsx.py:
--------------------------------------------------------------------------------
import pandas as pd
import json
import glob
from datetime import datetime

post_date_list = []
id_list = []
created_at_list = []
text_list = []
verified_list = []
username_list = []
likes_count_list = []

def funcion(file):

    post_date = str(file)
    # NOTE: this assumes Windows "\" path separators; os.path.basename(file) would be portable
    date = post_date.split('.')[0].split("_")[0].split('\\')[1]
    print(date)

    with open(file) as f:
        comments = json.load(f)

    for comment in comments:
        post_date = date
        id = str(comment["id"])
        timestamp = comment["created_at"]
        created_at = datetime.fromtimestamp(timestamp)  # Convert timestamp to datetime

        text = comment["text"]
        verified = comment["owner"]["is_verified"]
        username = comment["owner"]["username"]
        likes = comment["likes_count"]

        post_date_list.append(post_date)
        id_list.append(id)
        created_at_list.append(created_at)
        text_list.append(text)
        verified_list.append(verified)
        username_list.append(username)
        likes_count_list.append(likes)

json_files = glob.glob("jsons/*.json")
for file in json_files:
    funcion(file)

print("create DF")

df = pd.DataFrame({
    "post_date": post_date_list,
    "id": id_list,
    "created_at": created_at_list,
    "text": text_list,
    "verified": verified_list,
    "username": username_list,
    "likes": likes_count_list,
})
df.to_excel("output.xlsx")
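# Illustrative shape of each input file, inferred from the keys read above
# (each JSON file is assumed to hold a list of comment objects):
# [
#     {
#         "id": 17890000000000000,
#         "created_at": 1584195600,
#         "text": "comment text",
#         "likes_count": 3,
#         "owner": {"is_verified": false, "username": "someone"}
#     }
# ]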
--------------------------------------------------------------------------------
/Instaloader_scripts/json_posts_to_xlsx.py:
--------------------------------------------------------------------------------
# Basic Libraries
import pandas as pd
import json
import glob
from datetime import datetime

# This is an example of parsing JSON files downloaded with Instaloader.
# You'll need to run a query similar to:
# instaloader username --no-pictures --no-videos --no-metadata --no-compress-json


global_df = []

def to_xlsx(file):
    info = {}

    with open(file) as f:
        data = json.load(f)
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except KeyError:
        info["account_name"] = "null"
    try:
        info["shortcode"] = "https://www.instagram.com/p/" + data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = 0
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"

    # Additional fields

    try:
        info["full_name"] = data["node"]["owner"]["full_name"]
    except KeyError:
        info["full_name"] = "null"
    try:
        info["is_professional_account"] = data["node"]["owner"]["is_professional_account"]
    except KeyError:
        info["is_professional_account"] = "null"
    try:
        info["is_business_account"] = data["node"]["owner"]["is_business_account"]
    except KeyError:
        info["is_business_account"] = "null"
    try:
        info["is_verified"] = data["node"]["owner"]["is_verified"]
    except KeyError:
        info["is_verified"] = "null"
    try:
        info["is_video"] = data["node"]["is_video"]
    except KeyError:
        info["is_video"] = "null"
    try:
        info["category_name"] = data["node"]["owner"]["category_name"]
    except KeyError:
        info["category_name"] = "null"

    return info  # Return the dict with all the data


json_files = glob.glob("*.json")  # Path to JSON files
amount = len(json_files)  # Count the number of files for a progress bar

for json_file in json_files:
    print(json_file)
    info = to_xlsx(json_file)  # This is the main function

    df = pd.DataFrame({
        "post_date": info["timestamp"],
        "account_id": info["account_name"],
        "full_name": info["full_name"],
        "text": info["post_text"],
        "post_shortcode": info["shortcode"],
        "like_count": info["like_count"],
        "comment_count": info["comment_count"],
        "is_video": info["is_video"],
        "video_view_count": info["video_view_count"],
        "comments_policy": info["comments_disabled"],
        "is_professional": info["is_professional_account"],
        "is_business": info["is_business_account"],
        "is_verified": info["is_verified"],
        "person_category": info["category_name"]
    }, index=[1])
    global_df.append(df)

final = pd.concat(global_df)
final.to_csv("filename.csv", index=False)  # Your filename
print("job done!")
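# Illustrative (minimal) shape of the JSON this script expects, based on the
# keys accessed above:
# {
#     "node": {
#         "shortcode": "B1234abcd",
#         "taken_at_timestamp": 1584195600,
#         "is_video": false,
#         "comments_disabled": false,
#         "owner": {"id": "123", "full_name": "...", "is_verified": false},
#         "edge_media_to_caption": {"edges": [{"node": {"text": "..."}}]},
#         "edge_media_preview_like": {"count": 0},
#         "edge_media_to_comment": {"count": 0}
#     }
# }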
--------------------------------------------------------------------------------
/Instaloader_scripts/posts_to_sql.py:
--------------------------------------------------------------------------------
# Basic Libraries
import pandas as pd
import json
import glob
from datetime import datetime

# Progress bar and sleep functions
from tqdm import tqdm
from time import sleep

# MySQL libraries
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

# Starting connections
engine = create_engine("mysql+pymysql://your_db_username:your_pw@localhost/your_db_name?charset=utf8mb4")

connection = pymysql.connect(host='localhost',
                             user='your_db_username',
                             password='your_pw',
                             db='your_db_name',  # Database name (the table is set in df.to_sql below)
                             charset="utf8mb4")

# VERY IMPORTANT INFO: Charset, collations and any other encoding in the DB must be set to utf8mb4 to support emojis!
# More info: https://mathiasbynens.be/notes/mysql-utf8mb4
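# For example (illustrative SQL; database and table names are placeholders),
# an existing database and table can be switched to utf8mb4 with:
#   ALTER DATABASE your_db_name CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
#   ALTER TABLE your_table CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;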


# Starting Job

info = {}
def to_xlsx(file):

    with open(file) as f:
        data = json.load(f)
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except KeyError:
        info["account_name"] = "null"
    try:
        info["shortcode"] = data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = 0
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"
    return info


json_files = glob.glob("coronavirus/#coronavirusespaña/*.json")

amount = len(json_files)  # Count the number of files for the progress bar
message = "Dumping to DB: "

pbar = tqdm(total=amount, bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b}', desc=message)  # Parameters for the tqdm progress bar

for json_file in json_files:
    to_xlsx(json_file)  # This is the main function

    post_text_list = []
    account_name_list = []
    shortcode_list = []
    timestamp_list = []
    like_count_list = []
    comment_count_list = []
    video_view_count_list = []
    comments_disabled_list = []

    post_text_list.append(info["post_text"])
    account_name_list.append(info["account_name"])
    shortcode_list.append(info["shortcode"])
    timestamp_list.append(info["timestamp"])
    like_count_list.append(info["like_count"])
    comment_count_list.append(info["comment_count"])
    video_view_count_list.append(info["video_view_count"])
    comments_disabled_list.append(info["comments_disabled"])

    df = pd.DataFrame({
        "text": post_text_list,
        "account": account_name_list,
        "shortcode": shortcode_list,
        "timestamp": timestamp_list,
        "like_count": like_count_list,
        "comment_count": comment_count_list,
        "video_view_count": video_view_count_list,
        "comments_disabled": comments_disabled_list,
    })

    df.to_sql('coronavirus', index=False, con=engine, if_exists='append', chunksize=1000)  # This saves each loop's row to the database

    pbar.update()  # The progress bar advances on each loop

pbar.close()  # Close the progress bar and prevent it from re-printing

print("job done!")
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/export.py:
--------------------------------------------------------------------------------
import pandas as pd

def framer(data):
    print("------> From Dict to Dataframe | wait!")
    frames_list = []
    for post in data:

        frame = pd.DataFrame({
            "owner_username": post["owner_username"],
            "owner_id": post["owner_id"],
            "post_date": post["post_date"],
            "post_caption": [post["post_caption"]],
            "tagged_users": [post["tagged_users"]],
            "caption_mentions": [post["caption_mentions"]],
            "is_video": post["is_video"],
            "video_view_count": post["video_view_count"],
            "video_duration": post["video_duration"],
            "likes": post["likes"],
            "comments": post["comments"],
            "post_url": post["post_url"],
            "hashtags_caption": [post["hashtags_caption"]],
        })
        frames_list.append(frame)

    final_frame = pd.concat(frames_list, ignore_index=True)
    final_frame.to_excel("output.xlsx")
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/extractor.py:
--------------------------------------------------------------------------------
def post_to_dict(post):
    data = {}
    data["owner_username"] = post.owner_username
    data["owner_id"] = post.owner_id
    data["post_date"] = post.date_utc
    data["post_caption"] = post.caption
    data["tagged_users"] = post.tagged_users
    data["caption_mentions"] = post.caption_mentions
    data["is_video"] = post.is_video
    data["video_view_count"] = post.video_view_count
    data["video_duration"] = post.video_duration
    data["likes"] = post.likes
    data["comments"] = post.comments
    data["post_url"] = "https://www.instagram.com/p/" + post.shortcode
    data["hashtags_caption"] = post.caption_hashtags

    return data
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/main.py:
--------------------------------------------------------------------------------
import time
import concurrent.futures
import query as q
import extractor as e
import export as x

profile_list = ["profile_name1", "profile_name2", "etc"]  # Put the list of usernames here

def insta_request(profile_list):
    requested_data_list = []
    for user in profile_list:
        insta_response = q.request(user)
        requested_data_list.extend(insta_response)
    return requested_data_list

def extractor(requested_data):
    parsed_data_list = []
    print("------> Parsing responses to dictionary: | wait!")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        parsed_data = executor.map(e.post_to_dict, requested_data)
        parsed_data_list.extend(parsed_data)
    return parsed_data_list

if __name__ == '__main__':

    start_time = time.time()

    requested_data = insta_request(profile_list)
    parsed_data = extractor(requested_data)
    x.framer(parsed_data)

    print("--- %s seconds ---" % (time.time() - start_time))
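# Design note: e.post_to_dict reads Post attributes that may trigger extra
# network requests, so the mapping step is largely I/O-bound; that is why a
# ThreadPoolExecutor can speed it up despite Python's GIL.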
print("--- %s seconds ---" % (time.time() - start_time)) 32 | -------------------------------------------------------------------------------- /Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/query.py: -------------------------------------------------------------------------------- 1 | import instaloader 2 | 3 | def request(user): 4 | print("------> Making request for user: "+user+" wait!") 5 | L = instaloader.Instaloader( 6 | download_pictures=False, 7 | download_videos=False, 8 | download_video_thumbnails=False, 9 | compress_json=False, 10 | download_geotags=False, 11 | post_metadata_txt_pattern=None, 12 | max_connection_attempts=0, 13 | download_comments=False, 14 | ) 15 | profile = instaloader.Profile.from_username(L.context, user) 16 | posts = profile.get_posts() 17 | return posts 18 | -------------------------------------------------------------------------------- /Instaloader_scripts/profile_posts_to_xlsx_fast/readme.md: -------------------------------------------------------------------------------- 1 | # Download posts from any public Instagram account 2 | ### dependencies 3 | ```bash 4 | Instaloader (for Instagram data collection) 5 | pandas (for dataframe creation) 6 | openpyxl (for xlsx export) 7 | ``` 8 | 9 | ### setup 10 | 11 | Set the aim accounts in "profile_list" --> main.py (line 7). For example: 12 | ```python 13 | profile_list = ["profile_name1", "profile_name2", "etc"] 14 | ``` 15 | 16 | You can configure data output in query.py 17 | 18 | The default config will only output the post data in xlsx file. 19 | ```python 20 | download_pictures=False, 21 | download_videos=False, 22 | download_video_thumbnails=False, 23 | compress_json=False, 24 | download_geotags=False, 25 | post_metadata_txt_pattern=None, 26 | max_connection_attempts=0, 27 | download_comments=False, 28 | ``` 29 | ¿Want to know more about this configuration? visit https://instaloader.github.io/module/instaloader.html 30 | 31 | ### Run the script: 32 | ``` 33 | python main.py 34 | ``` 35 | 36 | ### **Output** 37 | The script will create a file named "output.xlsx" with all the data. 38 | 39 | #### Citation 40 | *Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Avaliable from: https://github.com/AdriaPadilla/InstaloaderScripts/* 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 AdriaPadilla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
### setup

Set the target accounts in "profile_list" --> main.py (line 7). For example:
```python
profile_list = ["profile_name1", "profile_name2", "etc"]
```

You can configure the data output in query.py.

The default config will only output the post data in an xlsx file.
```python
download_pictures=False,
download_videos=False,
download_video_thumbnails=False,
compress_json=False,
download_geotags=False,
post_metadata_txt_pattern=None,
max_connection_attempts=0,
download_comments=False,
```
Want to know more about this configuration? Visit https://instaloader.github.io/module/instaloader.html

### Run the script:
```
python main.py
```

### **Output**
The script will create a file named "output.xlsx" with all the data.

#### Citation
*Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Available from: https://github.com/AdriaPadilla/InstaloaderScripts/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 AdriaPadilla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Instaloader Auxiliary Scripts

Here you'll find some scripts to make your life easier when working with Instaloader data.

## Important

Instaloader is constantly improving and adapting its base code to Instagram changes. Some endpoints may change, and some of these scripts can stop working.

The aim of this repository is to serve as an example, for educational purposes.

Please, see: [Instaloader](https://instaloader.github.io/)

Some of these scripts were created to process JSON files; others amplify/modulate the capabilities of standard extractions, using the Instaloader main class to add new fields to the capture.
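For a quick start, a minimal sketch of that module-level pattern (the profile name below is a placeholder; the options mirror those used in these scripts):

```python
import instaloader

# Metadata-only options, as used by the scripts in this repository
L = instaloader.Instaloader(
    download_pictures=False,
    download_videos=False,
    download_video_thumbnails=False,
    compress_json=False,
    post_metadata_txt_pattern=None,
)

# Iterate over a public profile's posts and read extra fields from each Post
profile = instaloader.Profile.from_username(L.context, "some_public_profile")
for post in profile.get_posts():
    print(post.shortcode, post.date_utc, post.likes, post.caption_hashtags)
```
--------------------------------------------------------------------------------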