├── .gitignore
├── README.md
├── extract.py
└── test.txt

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
pyExtractor
===========

A Python tool to extract data types such as emails, URLs, domains and phone numbers from a text file.

# Usage

```
usage: extract.py [-h] [-e] [-u] [-d] [-m] [-a] [-v VERBOSITY] [--version]
                  filename

Extract useful data from a file!

positional arguments:
  filename              The filename to extract data from

optional arguments:
  -h, --help            show this help message and exit
  -e, --emails          Extract emails
  -u, --urls            Extract URLs
  -d, --domains         Extract domain names
  -m, --mobile          Extract mobile phone numbers (for Singapore only)
  -a, --all             Extract all data types above
  -v VERBOSITY, --verbosity VERBOSITY
                        Increase output verbosity
  --version             show program's version number and exit
```

# Example

The repository includes a test file, `test.txt`, that contains some garbage. Clone the repository and run the following command to extract the mobile numbers:

    python extract.py -m test.txt

A `test - mobile.csv` file containing the results will be generated.

Similarly, you can use `-e` for emails, `-u` for URLs, `-d` for domains, or `-a` for all types.
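# Programmatic use

The extraction functions in `extract.py` can also be called directly from Python. A minimal sketch (assuming you run it from the repository root, with the same Python 2 interpreter the script targets):

```python
# Import the extraction helpers defined in extract.py
from extract import extract_emails, extract_urls, extract_sg_mobile

data = open('test.txt').read()

print(sorted(extract_emails(data)))     # emails, including de-obfuscated ones
print(sorted(extract_urls(data)))       # URLs such as http://twitter.com
print(sorted(extract_sg_mobile(data)))  # 8-digit numbers starting with 8 or 9
```

Each helper returns a `set`, so duplicate matches are removed automatically.
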
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
"""
Read in a file
Extract either:
- emails
- urls
- domains
- mobile (simplified, Singapore mobile numbers only)
- all
Write the results to a file
"""

import re
import urlparse

# Email regex for extraction
# A simple version
email_regex = re.compile('([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)

# URL regex
# url_regex = re.compile('(https?://\S*)', re.IGNORECASE)
url_regex = re.compile('(?:(?:https?|ftp|file)://|www\.|ftp\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.IGNORECASE)

# Singapore mobile phone number regex
# A match starts with 8 or 9, and is 8 digits long
sg_mobile_phone_regex = re.compile('[89][0-9]{7}')


# A convenient enum for the types of data that can be extracted
class EXTRACT_TYPE:
    EMAIL, URL, DOMAIN, MOBILE = 'email', 'url', 'domain', 'mobile'


def extract(data_type, in_filename):
    # Read the file
    with open(in_filename, 'r') as in_file:
        data = in_file.read()

    extracted_data_list = set()

    if data_type == EXTRACT_TYPE.EMAIL:
        extracted_data_list = extract_emails(data)
        print '%d emails extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.URL:
        extracted_data_list = extract_urls(data)
        print '%d URLs extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.DOMAIN:
        extracted_data_list = extract_domains(data)
        print '%d domains extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.MOBILE:
        extracted_data_list = extract_sg_mobile(data)
        print '%d mobile numbers extracted to .csv' % len(extracted_data_list)

    # Write to a file named after the input and the data type
    # eg. filename - email.csv
    out_filename = in_filename.split('.')[0] + ' - ' + data_type + '.csv'
    with open(out_filename, 'w') as out_file:
        out_file.write('\n'.join(extracted_data_list))


def cleanup_for_emails(data):
    """
    Clean up obfuscated email addresses in data.
    These are replaced with @:  [at] (at) /at/ - at - [@]
    These are replaced with .:  [dot] (dot) /dot/ [.] <.>
    The surrounding whitespace is removed as well.
    """
    data = re.sub(r'\s*[\[(</-]\s*(at|@)\s*[\])>/-]\s*|\s+at\s+', '@', data, flags=re.IGNORECASE)
    data = re.sub(r'\s*[\[(</-]\s*(dot|\.)\s*[\])>/-]\s*|\s+dot\s+', '.', data, flags=re.IGNORECASE)
    return data
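
# Illustration of the cleanup step above (a sketch of the expected behaviour,
# assuming the two substitutions in cleanup_for_emails):
#
#   cleanup_for_emails('samwize [at] google dot com')  ->  'samwize@google.com'
#   cleanup_for_emails('hinfai/at/gmail/dot/com')      ->  'hinfai@gmail.com'
#
# extract_emails() below relies on this, so that the plain email regex can then
# pick up the de-obfuscated addresses.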


def extract_emails(data):
    """
    Extract all emails from data, including ones hidden with common email
    obfuscation techniques. It is impossible to cover every case, but this
    method tries to handle as many of the common ones as possible.

    >>> extract_emails('samwize@gmail.com')
    set(['samwize@gmail.com'])

    >>> extract_emails('bluebirdof [dot] happiness [at] yahoo [dot] com')
    set(['bluebirdof.happiness@yahoo.com'])

    >>> extract_emails('rajr at uol dot com dot br')
    set(['rajr@uol.com.br'])

    >>> extract_emails('hinfai/at/gmail/dot/com')
    set(['hinfai@gmail.com'])

    >>> extract_emails('eehassell - at - hushmail.com')
    set(['eehassell@hushmail.com'])

    >>> extract_emails('kellydc[.]wanderer[@]gmail<.>com')
    set(['kellydc.wanderer@gmail.com'])

    """
    email_set = set()

    # Clean up the data for emails first
    data = cleanup_for_emails(data)

    # Extract each email
    for email in email_regex.findall(data):
        email_set.add(email)

    return email_set


def extract_urls(data):
    r"""
    Extract URLs from data.

    >>> extract_urls('yoyoyo http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/ yoyoyo')
    set(['http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/'])

    >>> extract_urls('ohoho\nhttp://www.google.com.sg/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&ved=0CHQQFjAB&url=http%3A%2F%2Fwww.junda.com%2F&ei=faoXUMKNEsWGrAfhm4DwCg&usg=AFQjCNFXDbUHvVhdvVkPuSgDVU-Pb01EiA&sig2=EkLM-_6En7Jg4_XVAzQYAQ me too\nme too')
    set(['http://www.google.com.sg/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&ved=0CHQQFjAB&url=http%3A%2F%2Fwww.junda.com%2F&ei=faoXUMKNEsWGrAfhm4DwCg&usg=AFQjCNFXDbUHvVhdvVkPuSgDVU-Pb01EiA&sig2=EkLM-_6En7Jg4_XVAzQYAQ'])

    >>> extract_urls('https://a.b.com http://a.b.org')
    set(['https://a.b.com', 'http://a.b.org'])

    """
    _set = set()
    for url in url_regex.findall(data):
        _set.add(url)
    return _set


def extract_domains(data):
    """
    Extract the domains.

    This method uses extract_urls to get all the URLs first, then uses the
    urlparse module to get the hostname of each one.
    """
    _set = set()
    urls = extract_urls(data)
    for url in urls:
        # urlparse only fills in hostname when a scheme is present, so add one
        # for matches like 'www.example.com' that the URL regex allows
        if '://' not in url:
            url = 'http://' + url
        hostname = urlparse.urlparse(url).hostname.split('.')
        # Keep the last 3 labels when the second-level label is short
        # (eg. google.com.sg), otherwise keep the last 2 (eg. twitter.com)
        hostname = '.'.join(hostname[-3:] if len(hostname[-2]) < 4 else hostname[-2:])
        _set.add(hostname)
    return _set
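
# Illustration of the domain-trimming rule above (a sketch of the expected
# behaviour, assuming the hostname heuristic in extract_domains):
#
#   extract_domains('http://mail.google.com')        ->  set(['google.com'])
#   extract_domains('http://www.google.com.sg/url')  ->  set(['google.com.sg'])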


def extract_sg_mobile(data):
    """
    Extract Singapore mobile phone numbers.

    The regex is a simplified one: a match starts with 8 or 9 and is 8 digits long.
    """
    _set = set()
    for phone in sg_mobile_phone_regex.findall(data):
        _set.add(phone)
    return _set


if __name__ == "__main__":
    import argparse

    # Describe the tool
    parser = argparse.ArgumentParser(description='Extract useful data from a file!')

    # filename is a positional argument
    parser.add_argument("filename", help="The filename to extract data from")

    parser.add_argument("-e", "--emails", action='store_true', help="Extract emails")
    parser.add_argument("-u", "--urls", action='store_true', help="Extract URLs")
    parser.add_argument("-d", "--domains", action='store_true', help="Extract domain names")
    parser.add_argument("-m", "--mobile", action='store_true', help="Extract mobile phone numbers (for Singapore only)")
    parser.add_argument("-a", "--all", action='store_true', help="Extract all data types above")

    # eg. -v 2
    parser.add_argument("-v", "--verbosity", help="Increase output verbosity")
    parser.add_argument('--version', action='version', version='%(prog)s 0.1')

    args = parser.parse_args()
    # print vars(args)

    if args.emails or args.all:
        print 'Extracting emails..'
        extract(EXTRACT_TYPE.EMAIL, args.filename)

    if args.urls or args.all:
        print 'Extracting URLs..'
        extract(EXTRACT_TYPE.URL, args.filename)

    if args.domains or args.all:
        print 'Extracting domains..'
        extract(EXTRACT_TYPE.DOMAIN, args.filename)

    if args.mobile or args.all:
        print 'Extracting mobile numbers..'
        extract(EXTRACT_TYPE.MOBILE, args.filename)

    if args.verbosity:
        print 'Verbosity level: %s' % args.verbosity


--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
Here's some garbage
My email is samwize@github.com
If bounced, try samwize [at] google dot com
facebook.com is great
http://twitter.com and http://mail.google.com are awesome
All fails, you can call me at 90000000 80000000 600000000
Or 91111111
That's it
--------------------------------------------------------------------------------
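
Given the regexes above, running `python extract.py -a test.txt` against this test file should produce four CSVs, roughly as follows (row order may vary, since results are collected in sets):

```
test - email.csv    samwize@github.com, samwize@google.com
test - url.csv      http://twitter.com, http://mail.google.com
test - domain.csv   twitter.com, google.com
test - mobile.csv   90000000, 80000000, 91111111
```

The docstring examples in `extract.py` can also be checked with `python -m doctest extract.py`.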