├── .gitignore
├── Instaloader_scripts
│   ├── automate_commandline_execution.py
│   ├── get_profile_posts
│   │   ├── get_profile_posts
│   │   │   └── main.py
│   │   └── readme.md
│   ├── hashtag_post_between_dates.py
│   ├── instagram_comments_to_xlsx.py
│   ├── json_posts_to_xlsx.py
│   ├── posts_to_sql.py
│   └── profile_posts_to_xlsx_fast
│       ├── posts_to_xlsx
│       │   ├── export.py
│       │   ├── extractor.py
│       │   ├── main.py
│       │   └── query.py
│       └── readme.md
├── LICENSE
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/Instaloader_scripts/automate_commandline_execution.py:
--------------------------------------------------------------------------------
# Libraries
import pandas as pd
import json
import glob
from datetime import datetime
import os
import time

# This code is an example of how to automate the execution of Instaloader commands in a terminal.
# You'll need an Instagram account to log in with.
# This is necessary to avoid the limits of the Instagram API.
# Once you launch the script, you will be asked to enter your account password.

# STEP 1: Define the variables
your_account = "your_account_login"
accounts_list = ["account1", "account2", "etc"]


def to_xlsx(file):
    info = {}

    with open(file) as f:
        data = json.load(f)

    try:
        info["type"] = data["node"]["__typename"]
    except (IndexError, KeyError):
        info["type"] = "null"
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["caption"] = data["node"]["accessibility_caption"]
    except (IndexError, KeyError):
        info["caption"] = "no-caption"
    try:
        info["has_audio"] = data["node"]["has_audio"]
    except (IndexError, KeyError):
        info["has_audio"] = "null"
    try:
        info["location"] = data["node"]["location"]["name"]
    except (IndexError, KeyError, TypeError):
        info["location"] = "none"
    try:
        info["location-id"] = data["node"]["location"]["id"]
    except (IndexError, KeyError, TypeError):
        info["location-id"] = "none"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except (IndexError, KeyError, TypeError):
        info["account_name"] = "null"
    try:
        info["shortcode"] = "https://www.instagram.com/p/" + data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = "null"
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"
    try:
        info["full_name"] = data["node"]["owner"]["full_name"]
    except KeyError:
        info["full_name"] = "null"
    try:
        info["is_professional_account"] = data["node"]["owner"]["is_professional_account"]
    except KeyError:
        info["is_professional_account"] = "null"
    try:
        info["is_business_account"] = data["node"]["owner"]["is_business_account"]
    except KeyError:
        info["is_business_account"] = "null"
    try:
        info["is_verified"] = data["node"]["owner"]["is_verified"]
    except KeyError:
        info["is_verified"] = "null"
    try:
        info["is_video"] = data["node"]["is_video"]
    except KeyError:
        info["is_video"] = "null"
    try:
        info["category_name"] = data["node"]["owner"]["category_name"]
    except KeyError:
        info["category_name"] = "null"
    try:
        info["followed_by"] = data["node"]["owner"]["edge_followed_by"]["count"]
    except KeyError:
        info["followed_by"] = "null"
    return info


# THIS IS THE MAIN FOR LOOP TO ITERATE OVER ACCOUNTS
for insta in accounts_list:

    global_df = []
    # THIS IS THE COMMAND EXECUTED IN TERMINAL
    os.system(f"instaloader {insta} --no-videos --no-pictures --no-captions --no-compress-json --max-connection-attempts 0 --login {your_account}")
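    # NOTE (illustrative alternative): os.system builds a raw shell string. The
    # same call could be made with subprocess.run, which avoids shell-quoting
    # issues; a sketch, assuming the same flags as above:
    #
    #     import subprocess
    #     subprocess.run(
    #         ["instaloader", insta, "--no-videos", "--no-pictures",
    #          "--no-captions", "--no-compress-json",
    #          "--max-connection-attempts", "0", "--login", your_account],
    #         check=True,
    #     )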
    json_files = glob.glob(f"{insta}/*.json")  # Path to JSON files
    amount = len(json_files)  # Count the number of files for a progress bar

    for json_file in json_files:
        print(json_file)
        info = to_xlsx(json_file)  # This is the main function

        df = pd.DataFrame({
            "type": info["type"],
            "post_date": info["timestamp"],
            "account_id": info["account_name"],
            "full_name": info["full_name"],
            "text": info["post_text"],
            "caption": info["caption"],
            "post_shortcode": info["shortcode"],
            "like_count": info["like_count"],
            "comment_count": info["comment_count"],
            "is_video": info["is_video"],
            "has_audio": info["has_audio"],
            "video_view_count": info["video_view_count"],
            "comments_policy": info["comments_disabled"],
            "is_professional": info["is_professional_account"],
            "is_business": info["is_business_account"],
            "is_verified": info["is_verified"],
            "person_category": info["category_name"],
            "location": info["location"],
            "location_id": info["location-id"]
        }, index=[1])
        global_df.append(df)

    os.makedirs("datasets", exist_ok=True)  # Make sure the output folder exists
    final = pd.concat(global_df)
    final.to_excel(f"datasets/{insta}.xlsx", index=False)  # Your filename
    print("job done!")
    print("sleeping for 1 minute")
    time.sleep(60)  # Sleep 1 minute between accounts
    print("Start new")
--------------------------------------------------------------------------------
/Instaloader_scripts/get_profile_posts/get_profile_posts/main.py:
--------------------------------------------------------------------------------
import instaloader
import pandas as pd
import json
import time
import os
import glob
from datetime import datetime

# Global variables. Please define usernames (without @)

profile_list = ["username", "username", "username"]
save_path = "output_folder/"
sleep_seconds = 300  # Sleep time between users, in seconds. 5 minutes recommended.

def get_profile_posts(username, save_path):
    save_path = save_path + username + "/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    L = instaloader.Instaloader(
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        compress_json=False,
        download_geotags=False,
        post_metadata_txt_pattern=None,
        max_connection_attempts=0,
        download_comments=False,
    )

    profile = instaloader.Profile.from_username(L.context, username)
    posts = profile.get_posts()
    for post in posts:
        post_sleep = 1  # Sleep 1 second between posts
        print("sleeping for: " + str(post_sleep) + " seconds")
        time.sleep(post_sleep)

        data = post.__dict__
        data_node = data["_node"]  # Raw metadata node behind the Post object
        captured_on = time.strftime("%Y-%m-%d")
        file_name = captured_on + "_" + post.shortcode
        with open(os.path.join(save_path, file_name + ".json"), "w", encoding='utf-8') as write_file:
            json.dump(data_node, write_file, sort_keys=True, indent=4, ensure_ascii=False)
            print(write_file.name)


def decode_jsons(username, save_path):

    path_to_files = save_path + username + "/"
    json_files = glob.glob(os.path.join(path_to_files, "*.json"))

    list_of_df = []

    for file in json_files:
        with open(file, encoding="utf-8") as f:
            data = json.load(f)

        list_owner_id = []
        list_post_date = []
        list_likes = []
        list_comments = []
        list_caption = []
        list_of_tagged_users = []
        list_hashtags_in_text = []
        list_is_video = []
        list_post_shortcode = []

        user_id = data["owner"]["id"]
        list_owner_id.append(user_id)

        date = datetime.fromtimestamp(data["taken_at_timestamp"])
        list_post_date.append(date)

        is_video = data["is_video"]
        list_is_video.append(is_video)

        try:
            vid_v_count = data["video_view_count"]
        except KeyError:
            vid_v_count = "FALSE"

        shortcode = "https://www.instagram.com/p/" + data["shortcode"]
        list_post_shortcode.append(shortcode)

        comments = data["edge_media_to_comment"]["count"]
        list_comments.append(comments)

        try:
            tagged_users = data["edge_media_to_tagged_user"]["edges"]
            list_of_tagged = []
            for user in tagged_users:
                tagged_user = user["node"]["user"]["username"]
                list_of_tagged.append(tagged_user)
            list_of_tagged_users.append(list_of_tagged)
        except KeyError:
            list_of_tagged_users.append("False")

        try:
            caption = data["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (IndexError, KeyError):
            caption = "No Caption"  # Keeps `caption` defined for the hashtag scan below
        list_caption.append(caption)

        hashtags_in_text = [word for word in caption.split() if word.startswith("#")]
        list_hashtags_in_text.append(hashtags_in_text)

        likes = data["edge_media_preview_like"]["count"]
        list_likes.append(likes)

        df = pd.DataFrame({
            "username": username,
            "user_id": list_owner_id,
            "post_date": list_post_date,
            "caption": list_caption,
            "likes": list_likes,
            "comments": list_comments,
            "tagged_users": list_of_tagged_users,
            "hashtags": list_hashtags_in_text,
            "is_video": list_is_video,
            "vid_view_count": vid_v_count,
            "shortcode": list_post_shortcode,
        })
        list_of_df.append(df)

    final_frame = pd.concat(list_of_df)
    final_frame.to_excel(username + ".xlsx")

def loop():
    for username in profile_list:

        print("Getting Data for: " + username)
        get_profile_posts(username, save_path)  # This will collect all Instagram data

        print("Parsing Data for: " + username)
        decode_jsons(username, save_path)  # This will convert the JSON files to a dataframe
        print("Finished: " + username)

        actual_time = time.strftime("%H:%M:%S")
        time_sleep = sleep_seconds / 60
        print("sleeping for: " + str(time_sleep) + " minutes at " + actual_time)
        time.sleep(sleep_seconds)


def controller():

    # First step:
    # Download posts, extract data, serialize the JSON and create a .xlsx file for each user
    loop()

    # Second step:
    # Read all .xlsx files downloaded in the previous step and create a single file with all the data
    path = os.getcwd()
    files = glob.glob(os.path.join(path, "*.xlsx"))

    list_of_frames = []
    for file in files:
        df = pd.read_excel(file)
        list_of_frames.append(df)
    actual_time = time.strftime("%Y-%m-%d")
    df = pd.concat(list_of_frames)
    df.to_excel("your_dataset-" + actual_time + ".xlsx")

controller()
--------------------------------------------------------------------------------
/Instaloader_scripts/get_profile_posts/readme.md:
--------------------------------------------------------------------------------
# Instagram Profile Posts to ".xlsx" file

# Download posts from any public Instagram account
### dependencies
```bash
Instaloader (for Instagram data collection)
pandas (for dataframe creation)
openpyxl (for xlsx export)
xlrd (to import xlsx files)
```
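A minimal install command, assuming pip and the usual PyPI package names:

```bash
pip install instaloader pandas openpyxl xlrd
```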
### setup

Set the target accounts in "profile_list" --> main.py (line 11). For example:
```python
profile_list = ["profile_name1", "profile_name2", "etc"]
```

### Run the script:
```
python main.py
```

### **Output**
The script will create:
- A new directory for each account
- A JSON file for each post
- A .xlsx file for each account with its data
- A .xlsx file with all accounts in the same dataset

#### Citation
*Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Available from: https://github.com/AdriaPadilla/InstaloaderScripts/*
--------------------------------------------------------------------------------
/Instaloader_scripts/hashtag_post_between_dates.py:
--------------------------------------------------------------------------------
from datetime import datetime
from itertools import dropwhile, takewhile

import instaloader
import pandas as pd

def hashtag_post_between(hashtag):

    L = instaloader.Instaloader(  # Main class info: https://instaloader.github.io/as-module.html#instaloader-main-class
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        compress_json=False,
        download_geotags=False,
        post_metadata_txt_pattern=None,
        max_connection_attempts=0,
        download_comments=False,
    )

    posts = L.get_hashtag_posts(hashtag)

    SINCE = datetime(2019, 12, 19)  # Most recent date / format = (yyyy, mm, dd)
    UNTIL = datetime(2019, 9, 1)  # Oldest date / format = (yyyy, mm, dd)
    print("capturing posts from: " + str(SINCE) + " to: " + str(UNTIL))

    owner_username_list = []
    owner_id_list = []
    post_date_list = []
    post_caption_list = []
    tagged_users_list = []
    caption_mentions_list = []
    is_video_list = []
    video_view_count_list = []
    video_duration_list = []
    likes_list = []
    comments_list = []
    post_url_list = []
    hashtags_caption_list = []
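    # How the date window works: the hashtag feed is iterated newest-first, so
    # dropwhile below skips every post newer than SINCE, and takewhile then
    # keeps posts until it reaches one older than UNTIL. With the dates above,
    # only posts between 2019-09-01 and 2019-12-19 are collected.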
    for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, posts)):
        owner_username_list.append(post.owner_username)
        owner_id_list.append(post.owner_id)
        post_date_list.append(post.date_utc)
        post_caption_list.append(post.caption)
        tagged_users_list.append(post.tagged_users)
        caption_mentions_list.append(post.caption_mentions)
        is_video_list.append(post.is_video)
        video_view_count_list.append(post.video_view_count)
        video_duration_list.append(post.video_duration)
        likes_list.append(post.likes)
        comments_list.append(post.comments)
        post_url_list.append(post.shortcode)
        hashtags_caption_list.append(post.caption_hashtags)

    for date, name in zip(post_date_list, owner_username_list):
        print(date, name)

    df = pd.DataFrame({
        "owner_username": owner_username_list,
        "owner_id": owner_id_list,
        "post_date": post_date_list,
        "likes": likes_list,
        "comments": comments_list,
        "post_caption": post_caption_list,
        "hashtags_caption": hashtags_caption_list,
        "tagged_users": tagged_users_list,
        "caption_mentions": caption_mentions_list,
        "is_video": is_video_list,
        "video_view_count": video_view_count_list,
        "video_duration": video_duration_list,
        "post_shortcode": post_url_list,
    })

    df.to_excel(hashtag + ".xlsx")

    print(df)

hashtag_list = ['hashtag_1', 'hashtag_2', 'hashtag_n']  # You can use a single hashtag or iterate the function over a list. Hashtags go without the hash symbol (#).

for hashtag in hashtag_list:
    hashtag_post_between(hashtag)
--------------------------------------------------------------------------------
/Instaloader_scripts/instagram_comments_to_xlsx.py:
--------------------------------------------------------------------------------
import pandas as pd
import json
import glob
from datetime import datetime

post_date_list = []
id_list = []
created_at_list = []
text_list = []
verified_list = []
username_list = []
likes_count_list = []

def funcion(file):

    post_date = str(file)
    # NOTE: this assumes Windows "\" path separators; os.path.basename(file) would be portable
    date = post_date.split('.')[0].split("_")[0].split('\\')[1]
    print(date)

    with open(file) as f:
        comments = json.load(f)

    for comment in comments:
        post_date = date
        id = str(comment["id"])
        timestamp = comment["created_at"]
        created_at = datetime.fromtimestamp(timestamp)  # Convert timestamp to datetime

        text = comment["text"]
        verified = comment["owner"]["is_verified"]
        username = comment["owner"]["username"]
        likes = comment["likes_count"]

        post_date_list.append(post_date)
        id_list.append(id)
        created_at_list.append(created_at)
        text_list.append(text)
        verified_list.append(verified)
        username_list.append(username)
        likes_count_list.append(likes)

json_files = glob.glob("jsons/*.json")
for file in json_files:
    funcion(file)

print("create DF")

df = pd.DataFrame({
    "post_date": post_date_list,
    "id": id_list,
    "created_at": created_at_list,
    "text": text_list,
    "verified": verified_list,
    "username": username_list,
    "likes": likes_count_list,
})
df.to_excel("output.xlsx")
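# Illustrative shape of each input file, inferred from the keys read above
# (each JSON file is assumed to hold a list of comment objects):
# [
#     {
#         "id": 17890000000000000,
#         "created_at": 1584195600,
#         "text": "comment text",
#         "likes_count": 3,
#         "owner": {"is_verified": false, "username": "someone"}
#     }
# ]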
--------------------------------------------------------------------------------
/Instaloader_scripts/json_posts_to_xlsx.py:
--------------------------------------------------------------------------------
# Basic Libraries
import pandas as pd
import json
import glob
from datetime import datetime

# This is an example of parsing JSON files downloaded with Instaloader.
# You'll need to run a query similar to:
# instaloader username --no-pictures --no-videos --no-metadata --no-compress-json


global_df = []

def to_xlsx(file):
    info = {}

    with open(file) as f:
        data = json.load(f)
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except KeyError:
        info["account_name"] = "null"
    try:
        info["shortcode"] = "https://www.instagram.com/p/" + data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = 0
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"

    # Additional fields

    try:
        info["full_name"] = data["node"]["owner"]["full_name"]
    except KeyError:
        info["full_name"] = "null"
    try:
        info["is_professional_account"] = data["node"]["owner"]["is_professional_account"]
    except KeyError:
        info["is_professional_account"] = "null"
    try:
        info["is_business_account"] = data["node"]["owner"]["is_business_account"]
    except KeyError:
        info["is_business_account"] = "null"
    try:
        info["is_verified"] = data["node"]["owner"]["is_verified"]
    except KeyError:
        info["is_verified"] = "null"
    try:
        info["is_video"] = data["node"]["is_video"]
    except KeyError:
        info["is_video"] = "null"
    try:
        info["category_name"] = data["node"]["owner"]["category_name"]
    except KeyError:
        info["category_name"] = "null"

    return info  # Return the dict with all the data


json_files = glob.glob("*.json")  # Path to JSON files
amount = len(json_files)  # Count the number of files for a progress bar

for json_file in json_files:
    print(json_file)
    info = to_xlsx(json_file)  # This is the main function

    df = pd.DataFrame({
        "post_date": info["timestamp"],
        "account_id": info["account_name"],
        "full_name": info["full_name"],
        "text": info["post_text"],
        "post_shortcode": info["shortcode"],
        "like_count": info["like_count"],
        "comment_count": info["comment_count"],
        "is_video": info["is_video"],
        "video_view_count": info["video_view_count"],
        "comments_policy": info["comments_disabled"],
        "is_professional": info["is_professional_account"],
        "is_business": info["is_business_account"],
        "is_verified": info["is_verified"],
        "person_category": info["category_name"]
    }, index=[1])
    global_df.append(df)

final = pd.concat(global_df)
final.to_csv("filename.csv", index=False)  # Your filename
print("job done!")
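# Illustrative (minimal) shape of the JSON this script expects, based on the
# keys accessed above:
# {
#     "node": {
#         "shortcode": "B1234abcd",
#         "taken_at_timestamp": 1584195600,
#         "is_video": false,
#         "comments_disabled": false,
#         "owner": {"id": "123", "full_name": "...", "is_verified": false},
#         "edge_media_to_caption": {"edges": [{"node": {"text": "..."}}]},
#         "edge_media_preview_like": {"count": 0},
#         "edge_media_to_comment": {"count": 0}
#     }
# }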
--------------------------------------------------------------------------------
/Instaloader_scripts/posts_to_sql.py:
--------------------------------------------------------------------------------
# Basic Libraries
import pandas as pd
import json
import glob
from datetime import datetime

# Progress bar and sleep functions
from tqdm import tqdm
from time import sleep

# MySQL libraries
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

# Starting connections
engine = create_engine("mysql+pymysql://your_db_username:your_pw@localhost/your_db_name?charset=utf8mb4")

connection = pymysql.connect(host='localhost',
                             user='your_db_username',
                             password='your_pw',
                             db='your_db_name',  # Database name (the table is set in df.to_sql below)
                             charset="utf8mb4")

# VERY IMPORTANT INFO: Charset, collations and any other encoding in the DB must be set to utf8mb4 to support emojis!
# More info: https://mathiasbynens.be/notes/mysql-utf8mb4
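# For example (illustrative SQL; database and table names are placeholders),
# an existing database and table can be switched to utf8mb4 with:
#   ALTER DATABASE your_db_name CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
#   ALTER TABLE your_table CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;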


# Starting Job

info = {}
def to_xlsx(file):

    with open(file) as f:
        data = json.load(f)
    try:
        info["post_text"] = data["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (IndexError, KeyError):
        info["post_text"] = "null"
    try:
        info["account_name"] = data["node"]["owner"]["id"]
    except KeyError:
        info["account_name"] = "null"
    try:
        info["shortcode"] = data["node"]["shortcode"]
    except KeyError:
        info["shortcode"] = 0
    try:
        dt = data["node"]["taken_at_timestamp"]
        info["timestamp"] = datetime.fromtimestamp(dt)
    except KeyError:
        info["timestamp"] = 0
    try:
        info["like_count"] = data["node"]["edge_media_preview_like"]["count"]
    except KeyError:
        info["like_count"] = 0
    try:
        info["comment_count"] = data["node"]["edge_media_to_comment"]["count"]
    except KeyError:
        info["comment_count"] = 0
    try:
        info["video_view_count"] = data["node"]["video_view_count"]
    except (IndexError, KeyError):
        info["video_view_count"] = 0
    try:
        info["comments_disabled"] = data["node"]["comments_disabled"]
    except KeyError:
        info["comments_disabled"] = "null"
    return info


json_files = glob.glob("coronavirus/#coronavirusespaña/*.json")

amount = len(json_files)  # Count the number of files for the progress bar
message = "Dumping to DB: "

pbar = tqdm(total=amount, bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b}', desc=message)  # Parameters for the tqdm progress bar

for json_file in json_files:
    to_xlsx(json_file)  # This is the main function

    post_text_list = []
    account_name_list = []
    shortcode_list = []
    timestamp_list = []
    like_count_list = []
    comment_count_list = []
    video_view_count_list = []
    comments_disabled_list = []

    post_text_list.append(info["post_text"])
    account_name_list.append(info["account_name"])
    shortcode_list.append(info["shortcode"])
    timestamp_list.append(info["timestamp"])
    like_count_list.append(info["like_count"])
    comment_count_list.append(info["comment_count"])
    video_view_count_list.append(info["video_view_count"])
    comments_disabled_list.append(info["comments_disabled"])

    df = pd.DataFrame({
        "text": post_text_list,
        "account": account_name_list,
        "shortcode": shortcode_list,
        "timestamp": timestamp_list,
        "like_count": like_count_list,
        "comment_count": comment_count_list,
        "video_view_count": video_view_count_list,
        "comments_disabled": comments_disabled_list,
    })

    df.to_sql('coronavirus', index=False, con=engine, if_exists='append', chunksize=1000)  # This saves each loop's row to the database

    pbar.update()  # The progress bar advances on each loop

pbar.close()  # Close the progress bar and prevent it from re-printing

print("job done!")
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/export.py:
--------------------------------------------------------------------------------
import pandas as pd

def framer(data):
    print("------> From Dict to Dataframe | wait!")
    frames_list = []
    for post in data:

        frame = pd.DataFrame({
            "owner_username": post["owner_username"],
            "owner_id": post["owner_id"],
            "post_date": post["post_date"],
            "post_caption": [post["post_caption"]],
            "tagged_users": [post["tagged_users"]],
            "caption_mentions": [post["caption_mentions"]],
            "is_video": post["is_video"],
            "video_view_count": post["video_view_count"],
            "video_duration": post["video_duration"],
            "likes": post["likes"],
            "comments": post["comments"],
            "post_url": post["post_url"],
            "hashtags_caption": [post["hashtags_caption"]],
        })
        frames_list.append(frame)

    final_frame = pd.concat(frames_list, ignore_index=True)
    final_frame.to_excel("output.xlsx")
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/extractor.py:
--------------------------------------------------------------------------------
def post_to_dict(post):
    data = {}
    data["owner_username"] = post.owner_username
    data["owner_id"] = post.owner_id
    data["post_date"] = post.date_utc
    data["post_caption"] = post.caption
    data["tagged_users"] = post.tagged_users
    data["caption_mentions"] = post.caption_mentions
    data["is_video"] = post.is_video
    data["video_view_count"] = post.video_view_count
    data["video_duration"] = post.video_duration
    data["likes"] = post.likes
    data["comments"] = post.comments
    data["post_url"] = "https://www.instagram.com/p/" + post.shortcode
    data["hashtags_caption"] = post.caption_hashtags

    return data
--------------------------------------------------------------------------------
/Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/main.py:
--------------------------------------------------------------------------------
import time
import concurrent.futures
import query as q
import extractor as e
import export as x

profile_list = ["profile_name1", "profile_name2", "etc"]  # Put the list of usernames here

def insta_request(profile_list):
    requested_data_list = []
    for user in profile_list:
        insta_response = q.request(user)
        requested_data_list.extend(insta_response)
    return requested_data_list

def extractor(requested_data):
    parsed_data_list = []
    print("------> Parsing responses to dictionary: | wait!")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        parsed_data = executor.map(e.post_to_dict, requested_data)
        parsed_data_list.extend(parsed_data)
    return parsed_data_list

if __name__ == '__main__':

    start_time = time.time()

    requested_data = insta_request(profile_list)
    parsed_data = extractor(requested_data)
    x.framer(parsed_data)

    print("--- %s seconds ---" % (time.time() - start_time))
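# Design note: e.post_to_dict reads Post attributes that may trigger extra
# network requests, so the mapping step is largely I/O-bound; that is why a
# ThreadPoolExecutor can speed it up despite Python's GIL.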
print("--- %s seconds ---" % (time.time() - start_time)) 32 | -------------------------------------------------------------------------------- /Instaloader_scripts/profile_posts_to_xlsx_fast/posts_to_xlsx/query.py: -------------------------------------------------------------------------------- 1 | import instaloader 2 | 3 | def request(user): 4 | print("------> Making request for user: "+user+" wait!") 5 | L = instaloader.Instaloader( 6 | download_pictures=False, 7 | download_videos=False, 8 | download_video_thumbnails=False, 9 | compress_json=False, 10 | download_geotags=False, 11 | post_metadata_txt_pattern=None, 12 | max_connection_attempts=0, 13 | download_comments=False, 14 | ) 15 | profile = instaloader.Profile.from_username(L.context, user) 16 | posts = profile.get_posts() 17 | return posts 18 | -------------------------------------------------------------------------------- /Instaloader_scripts/profile_posts_to_xlsx_fast/readme.md: -------------------------------------------------------------------------------- 1 | # Download posts from any public Instagram account 2 | ### dependencies 3 | ```bash 4 | Instaloader (for Instagram data collection) 5 | pandas (for dataframe creation) 6 | openpyxl (for xlsx export) 7 | ``` 8 | 9 | ### setup 10 | 11 | Set the aim accounts in "profile_list" --> main.py (line 7). For example: 12 | ```python 13 | profile_list = ["profile_name1", "profile_name2", "etc"] 14 | ``` 15 | 16 | You can configure data output in query.py 17 | 18 | The default config will only output the post data in xlsx file. 19 | ```python 20 | download_pictures=False, 21 | download_videos=False, 22 | download_video_thumbnails=False, 23 | compress_json=False, 24 | download_geotags=False, 25 | post_metadata_txt_pattern=None, 26 | max_connection_attempts=0, 27 | download_comments=False, 28 | ``` 29 | ¿Want to know more about this configuration? visit https://instaloader.github.io/module/instaloader.html 30 | 31 | ### Run the script: 32 | ``` 33 | python main.py 34 | ``` 35 | 36 | ### **Output** 37 | The script will create a file named "output.xlsx" with all the data. 38 | 39 | #### Citation 40 | *Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Avaliable from: https://github.com/AdriaPadilla/InstaloaderScripts/* 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 AdriaPadilla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
### setup

Set the target accounts in "profile_list" --> main.py (line 7). For example:
```python
profile_list = ["profile_name1", "profile_name2", "etc"]
```

You can configure the data output in query.py.

The default config will only output the post data in an xlsx file.
```python
download_pictures=False,
download_videos=False,
download_video_thumbnails=False,
compress_json=False,
download_geotags=False,
post_metadata_txt_pattern=None,
max_connection_attempts=0,
download_comments=False,
```
Want to know more about this configuration? Visit https://instaloader.github.io/module/instaloader.html

### Run the script:
```
python main.py
```

### **Output**
The script will create a file named "output.xlsx" with all the data.

#### Citation
*Citation APA Style: Padilla Molina, Adrian (2020). InstaloaderScripts [Software]. Available from: https://github.com/AdriaPadilla/InstaloaderScripts/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 AdriaPadilla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Instaloader Auxiliary Scripts

Here you'll find some scripts to make your life easier when working with Instaloader data.

## Important

Instaloader is constantly improving and adapting its base code to Instagram changes. Some endpoints may change, and some of these scripts can stop working.

The aim of this repository is to serve as an example, for educational purposes.

Please, see: [Instaloader](https://instaloader.github.io/)

Some of these scripts were created to process JSON files; others amplify/modulate the capabilities of standard extractions, using the Instaloader main class to add new fields to the capture.
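For a quick start, a minimal sketch of that module-level pattern (the profile name below is a placeholder; the options mirror those used in these scripts):

```python
import instaloader

# Metadata-only options, as used by the scripts in this repository
L = instaloader.Instaloader(
    download_pictures=False,
    download_videos=False,
    download_video_thumbnails=False,
    compress_json=False,
    post_metadata_txt_pattern=None,
)

# Iterate over a public profile's posts and read extra fields from each Post
profile = instaloader.Profile.from_username(L.context, "some_public_profile")
for post in profile.get_posts():
    print(post.shortcode, post.date_utc, post.likes, post.caption_hashtags)
```
--------------------------------------------------------------------------------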