├── .gitignore ├── README.md └── gmail_export_all_emails.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gmail API - Export all mail messages into CSV 2 | 3 | By default API fetch only 100 messages in single request and they have provided limit of max 500 messages for single request. So, this script will help to fetch mail messages more than 500 or all emails. 4 | 5 | * Python version 3.6.0 6 | * To turn on the Gmail API follow instruction mentioned on [Python Quickstart](https://developers.google.com/gmail/api/quickstart/python) 7 | * Use [this wizard](https://console.developers.google.com/flows/enableapi?apiid=gmail) to create or select a project in the Google Developers Console and automatically turn on the API. 8 | * Put the client_secret.json credentials file (downloaded from Google Developers Console) in the same directory 9 | * Execute gmail_export_all_emails.py 10 | * Script will generate new .csv file in same directory. 11 | * Exported CSV file contain headers DateTime, Subject, Body 12 | * You can manupulate this script as per your requirement (adding new fields in export or importing this date in DB or sniffing emails of specific pattern) 13 | 14 | [Python script](https://raw.githubusercontent.com/imranm/Gmail-API-Read-All-Emails-Python/master/gmail_export_all_emails.py) 15 | -------------------------------------------------------------------------------- /gmail_export_all_emails.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reading GMAIL using Python 3 | - Imran Momin 4 | ''' 5 | 6 | ''' 7 | This script does the following: 8 | - Go to Gmal inbox 9 | - Find and read all messages (you can specify labels to read specific emails) 10 | - Extract details (Date, Subject, Body) and export them to a .csv file 11 | ''' 12 | 13 | ''' 14 | Before running this script, the user should get the authentication by following 15 | the link: https://developers.google.com/gmail/api/quickstart/python 16 | Also, client_secret.json should be saved in the same directory as this file 17 | ''' 18 | from apiclient import discovery 19 | from apiclient import errors 20 | from httplib2 import Http 21 | from oauth2client import file, client, tools 22 | import base64 23 | from bs4 import BeautifulSoup 24 | # import dateutil.parser as parser 25 | import csv 26 | from time import strftime, gmtime 27 | import sys 28 | 29 | def ReadEmailDetails(service, user_id, msg_id): 30 | 31 | temp_dict = { } 32 | 33 | try: 34 | 35 | message = service.users().messages().get(userId=user_id, id=msg_id).execute() # fetch the message using API 36 | payld = message['payload'] # get payload of the message 37 | headr = payld['headers'] # get header of the payload 38 | 39 | 40 | for one in headr: # getting the Subject 41 | if one['name'] == 'Subject': 42 | msg_subject = one['value'] 43 | temp_dict['Subject'] = msg_subject 44 | else: 45 | pass 46 | 47 | 48 | for two in headr: # getting the date 49 | if two['name'] == 'Date': 50 | msg_date = two['value'] 51 | # date_parse = (parser.parse(msg_date)) 52 | # m_date = (date_parse.datetime()) 53 | temp_dict['DateTime'] = msg_date 54 | else: 55 | pass 56 | 57 | 58 | # Fetching message body 59 | email_parts = payld['parts'] # fetching the message parts 60 | part_one = email_parts[0] # fetching first element of the part 61 | part_body = part_one['body'] # fetching body of the message 62 | part_data = part_body['data'] # fetching data from the body 63 | clean_one = part_data.replace("-","+") # decoding from Base64 to UTF-8 64 | clean_one = clean_one.replace("_","/") # decoding from Base64 to UTF-8 65 | clean_two = base64.b64decode (bytes(clean_one, 'UTF-8')) # decoding from Base64 to UTF-8 66 | soup = BeautifulSoup(clean_two , "lxml" ) 67 | message_body = soup.body() 68 | # message_body is a readible form of message body 69 | # depending on the end user's requirements, it can be further cleaned 70 | # using regex, beautiful soup, or any other method 71 | temp_dict['Message_body'] = message_body 72 | 73 | except Exception as e: 74 | print(e) 75 | temp_dict = None 76 | pass 77 | 78 | finally: 79 | return temp_dict 80 | 81 | 82 | def ListMessagesWithLabels(service, user_id, label_ids=[]): 83 | """List all Messages of the user's mailbox with label_ids applied. 84 | 85 | Args: 86 | service: Authorized Gmail API service instance. 87 | user_id: User's email address. The special value "me" 88 | can be used to indicate the authenticated user. 89 | label_ids: Only return Messages with these labelIds applied. 90 | 91 | Returns: 92 | List of Messages that have all required Labels applied. Note that the 93 | returned list contains Message IDs, you must use get with the 94 | appropriate id to get the details of a Message. 95 | """ 96 | try: 97 | response = service.users().messages().list(userId=user_id, 98 | labelIds=label_ids, 99 | maxResults=500).execute() 100 | 101 | messages = [] 102 | if 'messages' in response: 103 | messages.extend(response['messages']) 104 | 105 | while 'nextPageToken' in response: 106 | page_token = response['nextPageToken'] 107 | 108 | response = service.users().messages().list(userId=user_id, 109 | labelIds=label_ids, 110 | pageToken=page_token, 111 | maxResults=500).execute() 112 | 113 | messages.extend(response['messages']) 114 | 115 | print('... total %d emails on next page [page token: %s], %d listed so far' % (len(response['messages']), page_token, len(messages))) 116 | sys.stdout.flush() 117 | 118 | return messages 119 | 120 | except errors.HttpError as error: 121 | print('An error occurred: %s' % error) 122 | 123 | 124 | if __name__ == "__main__": 125 | print('\n... start') 126 | 127 | # Creating a storage.JSON file with authentication details 128 | SCOPES = 'https://www.googleapis.com/auth/gmail.modify' # we are using modify and not readonly, as we will be marking the messages Read 129 | store = file.Storage('storage.json') 130 | creds = store.get() 131 | 132 | if not creds or creds.invalid: 133 | flow = client.flow_from_clientsecrets('client_secret.json', SCOPES) 134 | creds = tools.run_flow(flow, store) 135 | 136 | GMAIL = discovery.build('gmail', 'v1', http=creds.authorize(Http())) 137 | 138 | user_id = 'me' 139 | label_id_one = 'INBOX' 140 | label_id_two = 'UNREAD' 141 | 142 | print('\n... list all emails') 143 | 144 | # email_list = ListMessagesWithLabels(GMAIL, user_id, [label_id_one,label_id_two]) # to read unread emails from inbox 145 | email_list = ListMessagesWithLabels(GMAIL, user_id, []) 146 | 147 | final_list = [ ] 148 | 149 | print('\n... fetching all emails data, this will take some time') 150 | sys.stdout.flush() 151 | 152 | 153 | #exporting the values as .csv 154 | rows = 0 155 | file = 'emails_%s.csv' % (strftime("%Y_%m_%d_%H%M%S", gmtime())) 156 | with open(file, 'w', encoding='utf-8', newline = '') as csvfile: 157 | fieldnames = ['Subject','DateTime','Message_body'] 158 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter = ',') 159 | writer.writeheader() 160 | 161 | for email in email_list: 162 | msg_id = email['id'] # get id of individual message 163 | email_dict = ReadEmailDetails(GMAIL,user_id,msg_id) 164 | 165 | if email_dict is not None: 166 | writer.writerow(email_dict) 167 | rows += 1 168 | 169 | if rows > 0 and (rows%50) == 0: 170 | print('... total %d read so far' % (rows)) 171 | sys.stdout.flush() 172 | 173 | print('... emails exported into %s' % (file)) 174 | print("\n... total %d message retrived" % (rows)) 175 | sys.stdout.flush() 176 | 177 | 178 | print('... all Done!') 179 | --------------------------------------------------------------------------------