├── .gitignore
├── README.md
├── extract.py
└── test.txt

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
pyExtractor
===========

A Python tool to extract data types such as emails, URLs, domains and phone numbers from a text file.

# Usage

```
usage: extract.py [-h] [-e] [-u] [-d] [-m] [-a] [-v VERBOSITY] [--version]
                  filename

Extract useful data from a file!

positional arguments:
  filename              The filename to extract data from

optional arguments:
  -h, --help            show this help message and exit
  -e, --emails          Extract emails
  -u, --urls            Extract URLs
  -d, --domains         Extract domain names
  -m, --mobile          Extract mobile phone numbers (for Singapore only)
  -a, --all             Extract all data types above
  -v VERBOSITY, --verbosity VERBOSITY
                        Increase output verbosity
  --version             show program's version number and exit
```

# Example

The repository includes a test file, `test.txt`, that contains some garbage. Clone the repository and run the following command to extract the mobile numbers:

    python extract.py -m test.txt

A `test - mobile.csv` file containing the results will be generated.

Similarly, you can use `-e` for emails, `-u` for URLs, `-d` for domains, or `-a` for all types.
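# Programmatic use

The extraction functions in `extract.py` can also be called directly from Python. A minimal sketch (assuming you run it from the repository root, with the same Python 2 interpreter the script targets):

```python
# Import the extraction helpers defined in extract.py
from extract import extract_emails, extract_urls, extract_sg_mobile

data = open('test.txt').read()

print(sorted(extract_emails(data)))     # emails, including de-obfuscated ones
print(sorted(extract_urls(data)))       # URLs such as http://twitter.com
print(sorted(extract_sg_mobile(data)))  # 8-digit numbers starting with 8 or 9
```

Each helper returns a `set`, so duplicate matches are removed automatically.
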
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
"""
Read in a file
Extract either:
- emails
- urls
- domains
- mobile (simplified, Singapore mobile numbers only)
- all
Write the results to a file
"""

import re
import urlparse

# Email regex for extraction
# A simple version
email_regex = re.compile('([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)

# URL regex
# url_regex = re.compile('(https?://\S*)', re.IGNORECASE)
url_regex = re.compile('(?:(?:https?|ftp|file)://|www\.|ftp\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.IGNORECASE)

# Singapore mobile phone number regex
# A match starts with 8 or 9, and is 8 digits long
sg_mobile_phone_regex = re.compile('[89][0-9]{7}')


# A convenient enum for the types of data that can be extracted
class EXTRACT_TYPE:
    EMAIL, URL, DOMAIN, MOBILE = 'email', 'url', 'domain', 'mobile'


def extract(data_type, in_filename):
    # Read the file
    with open(in_filename, 'r') as in_file:
        data = in_file.read()

    extracted_data_list = set()

    if data_type == EXTRACT_TYPE.EMAIL:
        extracted_data_list = extract_emails(data)
        print '%d emails extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.URL:
        extracted_data_list = extract_urls(data)
        print '%d URLs extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.DOMAIN:
        extracted_data_list = extract_domains(data)
        print '%d domains extracted to .csv' % len(extracted_data_list)

    if data_type == EXTRACT_TYPE.MOBILE:
        extracted_data_list = extract_sg_mobile(data)
        print '%d mobile numbers extracted to .csv' % len(extracted_data_list)

    # Write to a file named after the input and the data type
    # eg. filename - email.csv
    out_filename = in_filename.split('.')[0] + ' - ' + data_type + '.csv'
    with open(out_filename, 'w') as out_file:
        out_file.write('\n'.join(extracted_data_list))


def cleanup_for_emails(data):
    """
    Clean up obfuscated email addresses in data.
    These are replaced with @:  [at] (at) /at/ - at - [@]
    These are replaced with .:  [dot] (dot) /dot/ [.] <.>
    The surrounding whitespace is removed as well.
    """
    data = re.sub(r'\s*[\[(</-]\s*(at|@)\s*[\])>/-]\s*|\s+at\s+', '@', data, flags=re.IGNORECASE)
    data = re.sub(r'\s*[\[(</-]\s*(dot|\.)\s*[\])>/-]\s*|\s+dot\s+', '.', data, flags=re.IGNORECASE)
    return data
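
# Illustration of the cleanup step above (a sketch of the expected behaviour,
# assuming the two substitutions in cleanup_for_emails):
#
#   cleanup_for_emails('samwize [at] google dot com')  ->  'samwize@google.com'
#   cleanup_for_emails('hinfai/at/gmail/dot/com')      ->  'hinfai@gmail.com'
#
# extract_emails() below relies on this, so that the plain email regex can then
# pick up the de-obfuscated addresses.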


def extract_emails(data):
    """
    Extract all emails from data, including ones hidden with common email
    obfuscation techniques. It is impossible to cover every case, but this
    method tries to handle as many of the common ones as possible.

    >>> extract_emails('samwize@gmail.com')
    set(['samwize@gmail.com'])

    >>> extract_emails('bluebirdof [dot] happiness [at] yahoo [dot] com')
    set(['bluebirdof.happiness@yahoo.com'])

    >>> extract_emails('rajr at uol dot com dot br')
    set(['rajr@uol.com.br'])

    >>> extract_emails('hinfai/at/gmail/dot/com')
    set(['hinfai@gmail.com'])

    >>> extract_emails('eehassell - at - hushmail.com')
    set(['eehassell@hushmail.com'])

    >>> extract_emails('kellydc[.]wanderer[@]gmail<.>com')
    set(['kellydc.wanderer@gmail.com'])

    """
    email_set = set()

    # Clean up the data for emails first
    data = cleanup_for_emails(data)

    # Extract each email
    for email in email_regex.findall(data):
        email_set.add(email)

    return email_set


def extract_urls(data):
    r"""
    Extract URLs from data.

    >>> extract_urls('yoyoyo http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/ yoyoyo')
    set(['http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/'])

    >>> extract_urls('ohoho\nhttp://www.google.com.sg/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&ved=0CHQQFjAB&url=http%3A%2F%2Fwww.junda.com%2F&ei=faoXUMKNEsWGrAfhm4DwCg&usg=AFQjCNFXDbUHvVhdvVkPuSgDVU-Pb01EiA&sig2=EkLM-_6En7Jg4_XVAzQYAQ me too\nme too')
    set(['http://www.google.com.sg/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&ved=0CHQQFjAB&url=http%3A%2F%2Fwww.junda.com%2F&ei=faoXUMKNEsWGrAfhm4DwCg&usg=AFQjCNFXDbUHvVhdvVkPuSgDVU-Pb01EiA&sig2=EkLM-_6En7Jg4_XVAzQYAQ'])

    >>> extract_urls('https://a.b.com http://a.b.org')
    set(['https://a.b.com', 'http://a.b.org'])

    """
    _set = set()
    for url in url_regex.findall(data):
        _set.add(url)
    return _set


def extract_domains(data):
    """
    Extract the domains.

    This method uses extract_urls to get all the URLs first, then uses the
    urlparse module to get the hostname of each one.
    """
    _set = set()
    urls = extract_urls(data)
    for url in urls:
        # urlparse only fills in hostname when a scheme is present, so add one
        # for matches like 'www.example.com' that the URL regex allows
        if '://' not in url:
            url = 'http://' + url
        hostname = urlparse.urlparse(url).hostname.split('.')
        # Keep the last 3 labels when the second-level label is short
        # (eg. google.com.sg), otherwise keep the last 2 (eg. twitter.com)
        hostname = '.'.join(hostname[-3:] if len(hostname[-2]) < 4 else hostname[-2:])
        _set.add(hostname)
    return _set
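
# Illustration of the domain-trimming rule above (a sketch of the expected
# behaviour, assuming the hostname heuristic in extract_domains):
#
#   extract_domains('http://mail.google.com')        ->  set(['google.com'])
#   extract_domains('http://www.google.com.sg/url')  ->  set(['google.com.sg'])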


def extract_sg_mobile(data):
    """
    Extract Singapore mobile phone numbers.

    The regex is a simplified one: a match starts with 8 or 9 and is 8 digits long.
    """
    _set = set()
    for phone in sg_mobile_phone_regex.findall(data):
        _set.add(phone)
    return _set


if __name__ == "__main__":
    import argparse

    # Describe the tool
    parser = argparse.ArgumentParser(description='Extract useful data from a file!')

    # filename is a positional argument
    parser.add_argument("filename", help="The filename to extract data from")

    parser.add_argument("-e", "--emails", action='store_true', help="Extract emails")
    parser.add_argument("-u", "--urls", action='store_true', help="Extract URLs")
    parser.add_argument("-d", "--domains", action='store_true', help="Extract domain names")
    parser.add_argument("-m", "--mobile", action='store_true', help="Extract mobile phone numbers (for Singapore only)")
    parser.add_argument("-a", "--all", action='store_true', help="Extract all data types above")

    # eg. -v 2
    parser.add_argument("-v", "--verbosity", help="Increase output verbosity")
    parser.add_argument('--version', action='version', version='%(prog)s 0.1')

    args = parser.parse_args()
    # print vars(args)

    if args.emails or args.all:
        print 'Extracting emails..'
        extract(EXTRACT_TYPE.EMAIL, args.filename)

    if args.urls or args.all:
        print 'Extracting URLs..'
        extract(EXTRACT_TYPE.URL, args.filename)

    if args.domains or args.all:
        print 'Extracting domains..'
        extract(EXTRACT_TYPE.DOMAIN, args.filename)

    if args.mobile or args.all:
        print 'Extracting mobile numbers..'
        extract(EXTRACT_TYPE.MOBILE, args.filename)

    if args.verbosity:
        print 'Verbosity level: %s' % args.verbosity


--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
Here's some garbage
My email is samwize@github.com
If bounced, try samwize [at] google dot com
facebook.com is great
http://twitter.com and http://mail.google.com are awesome
All fails, you can call me at 90000000 80000000 600000000
Or 91111111
That's it
--------------------------------------------------------------------------------
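
Given the regexes above, running `python extract.py -a test.txt` against this test file should produce four CSVs, roughly as follows (row order may vary, since results are collected in sets):

```
test - email.csv    samwize@github.com, samwize@google.com
test - url.csv      http://twitter.com, http://mail.google.com
test - domain.csv   twitter.com, google.com
test - mobile.csv   90000000, 80000000, 91111111
```

The docstring examples in `extract.py` can also be checked with `python -m doctest extract.py`.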