├── __init__.py ├── features ├── __init__.py ├── table.py ├── files.py ├── sentiment.py ├── feeds.py ├── author.py ├── title.py ├── url2text.py ├── category.py ├── pypdf_to_image.py ├── file2text.py ├── images.py ├── entities.py ├── keywords.py └── main_text.py ├── requirements.txt ├── .gitignore ├── README.md ├── link.py └── pyteaser_c.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | beautifulsoup4==4.4.1 3 | cssselect==0.9.1 4 | goose-extractor==1.0.25 5 | jieba==0.38 6 | langid==1.1.5 7 | lxml==3.6.0 8 | nltk==3.2 9 | numexpr==2.5 10 | numpy==1.10.4 11 | Pillow==3.1.1 12 | pyfscache==0.9.12 13 | requests==2.9.1 14 | scikit-learn==0.17.1 15 | scipy==0.17.0 16 | tables==3.2.2 17 | textblob==0.11.1 18 | treelib==1.3.2 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | #Other 65 | /data 66 | /cache 67 | /features/cache 68 | restats 69 | -------------------------------------------------------------------------------- /features/table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from goose import Goose 4 | 5 | 6 | class TableExtractor(object): 7 | 8 | @classmethod 9 | def _get_tables(cls, html): 10 | """ 11 | Method to extract tables from html 12 | """ 13 | 14 | soup = BeautifulSoup(html, 'html.parser') 15 | return [t for t in soup.find_all('table')] 16 | 17 | @classmethod 18 | def extract(cls, html, page_html): 19 | 20 | soup = BeautifulSoup(html, 'html.parser') 21 | tables = [] 22 | excluded_tags = [ 23 | 'script', 'style', 'noscript', 'head', 'meta', 24 | 'header', 'footer', 'link', 'input', 'nav' 25 | ] 26 | 27 | [x.extract() for et in excluded_tags for x in soup.find_all(et) if x] 28 | 29 | for t in soup.find_all('table'): 30 | tables.append(t) 31 | 32 | return tables 33 | 34 | if __name__ == '__main__': 35 | # this packages should be here but we only need the for improving the 36 | # extractor therefore it might interfere with the rest of the project 37 | tE = TableExtractor() 38 | target_url = 'http://www.artisansofdevizes.com/product-collections/standard-tiles-flagstones/waldorf-limestone-collection-papyrus/' 39 | article = Goose().extract(target_url) 40 | print tE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /features/files.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from urlparse import urlparse 4 | import re 5 | 6 | 7 | class FilesExtractor(object): 8 | 9 | @classmethod 10 | def extract(cls, base_url, html): 11 | 12 | soup = BeautifulSoup(html, 'html.parser') 13 | pattern = re.compile('.*\.pdf|.*\.xls') 14 | base_url_parsed_netloc = urlparse(base_url).netloc.replace('www.', '') 15 | file_urls, context_text = [], [] 16 | 17 | for a in soup.find_all('a'): 18 | try: 19 | a_netloc = urlparse(a['href']).netloc.replace('www.', '') 20 | if a_netloc == base_url_parsed_netloc: 21 | 22 | a_title = a['title'] 23 | a_text = a.getText() 24 | matches = re.finditer(pattern, a['href']) 25 | 26 | for m in matches: 27 | 28 | # Seek context text 29 | txt = [] 30 | if a_title: 31 | txt.append(a_title) 32 | if a_text: 33 | txt.append(a_text) 34 | 35 | file_urls.append(a['href']) 36 | context_text.append(" ".join(txt)) 37 | 38 | except KeyError, e: 39 | continue 40 | 41 | return file_urls, context_text 42 | 43 | if __name__ == '__main__': 44 | from goose import Goose 45 | fE = FilesExtractor() 46 | # target_url = 'https://traditionalbrickandstone.co.uk/product/victoria-falls/' 47 | target_url = 'http://www.imperialhandmadebricks.co.uk/products/yellow-stock/' 48 | article = Goose().extract(target_url) 49 | print fE.extract(target_url, article.raw_html) 50 | 
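A hedged usage sketch for `FilesExtractor` that skips Goose: the extractor only needs the page URL and the raw HTML string, so plain `requests` (my substitution, not what the `__main__` block above uses) is enough to drive it; the target URL is the one from that block.

```python
# Sketch only: assumes the requests dependency from requirements.txt and network access.
import requests
from features.files import FilesExtractor

target_url = 'http://www.imperialhandmadebricks.co.uk/products/yellow-stock/'
html = requests.get(target_url).text
file_urls, context_text = FilesExtractor.extract(target_url, html)
for file_url, context in zip(file_urls, context_text):
    print file_url, '->', context
```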
-------------------------------------------------------------------------------- /features/sentiment.py: -------------------------------------------------------------------------------- 1 | from textblob import TextBlob 2 | 3 | 4 | def findSentiment(keywords): 5 | 6 | k_aux = {} 7 | for k in keywords: 8 | blob = TextBlob(k) 9 | k_aux[k] = {} 10 | 11 | if blob.sentiment.polarity < 0: 12 | k_aux[k]['word'] = 'negative' 13 | elif blob.sentiment.polarity > 0: 14 | k_aux[k]['word'] = 'positive' 15 | else: 16 | k_aux[k]['word'] = 'neutral' 17 | 18 | k_aux[k]['sentiment'] = blob.sentiment.polarity 19 | k_aux[k]['subjectivity'] = blob.sentiment.subjectivity 20 | keywords = k_aux 21 | 22 | return keywords 23 | 24 | def getSentimentText(text): 25 | item_aux = {} 26 | blob = TextBlob(text) 27 | 28 | if blob.sentiment.polarity < 0: 29 | item_aux['word'] = 'negative' 30 | elif blob.sentiment.polarity > 0: 31 | item_aux['word'] = 'positive' 32 | else: 33 | item_aux['word'] = 'neutral' 34 | 35 | item_aux['sentiment'] = blob.sentiment.polarity 36 | item_aux['subjectivity'] = blob.sentiment.subjectivity 37 | 38 | return item_aux 39 | 40 | if __name__ == '__main__': 41 | text = ''' 42 | The titular threat of The Blob has always struck me as the ultimate movie 43 | monster: an insatiably hungry, amoeba-like mass able to penetrate 44 | virtually any safeguard, capable of--as a doomed doctor chillingly 45 | describes it--"assimilating flesh on contact. 46 | Snide comparisons to gelatin be damned, it's a concept with the most 47 | devastating of potential consequences, not unlike the grey goo scenario 48 | proposed by technological theorists fearful of 49 | artificial intelligence run rampant. 50 | ''' 51 | print getSentimentText(text) -------------------------------------------------------------------------------- /features/feeds.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from urlparse import urlparse 5 | 6 | 7 | class FeedsExtractor(object): 8 | 9 | @classmethod 10 | def extract(self, url): 11 | rss = [] 12 | headers = {'Accept': ':text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip,deflate,sdch', 14 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36'} 15 | 16 | r = requests.get(url, headers=headers) 17 | soup = BeautifulSoup(r.text, "html.parser") 18 | 19 | for a in soup.find_all(['a', 'link']): 20 | href = a.get('href') 21 | # find urls that contain .rss 22 | if href: 23 | if 'rss' in href: 24 | rss.append(href) 25 | elif '/feed' in href: 26 | rss.append(href) 27 | 28 | if not rss: 29 | feep_paths = ['feed', 'rss'] 30 | o = urlparse(url) 31 | clean_url = o.scheme + "://" + o.netloc 32 | 33 | for f in feep_paths: 34 | try_url = clean_url + "/" + f 35 | r = requests.get(try_url, headers=headers) 36 | if r.status_code == 200: 37 | rss.append(try_url) 38 | 39 | return self._clean_rss(rss, url) 40 | 41 | @classmethod 42 | def _clean_rss(self, rss, base_url): 43 | o_base = urlparse(base_url) 44 | for i, item in enumerate(rss): 45 | o = urlparse(item) 46 | scheme = o.scheme 47 | netloc = o.netloc 48 | if not o.scheme: 49 | scheme = o_base.scheme 50 | if not o.netloc: 51 | netloc = o_base.netloc 52 | 53 | rss[i] = scheme + "://" + netloc + o.path 54 | 55 | return list(set(rss)) 56 | 57 | if __name__ == '__main__': 58 | fE = FeedsExtractor() 59 | print 
fE.extract('http://techcrunch.com') 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature Engineering 2 | The repository contains modules to extract features from text and web pages. The features can be used as training data for machine learning algorithms or to improve your applications. Some of the methods are geared towards news articles, but they also work with other domains. If you are not a Python programmer or need to do feature engineering at a larger scale, you can use [the API](https://market.mashape.com/adlegant/article-analysis). 3 | 4 | ## Installation 5 | ``` 6 | git clone git@github.com:webeng/feature_engineering.git 7 | cd feature_engineering 8 | virtualenv env 9 | source env/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Run `python link.py` to see an example. 14 | 15 | If you want fast keyword extraction, you will have to [install HDF5](http://www.hdfgroup.org/ftp/HDF5/current/src/unpacked/release_docs/INSTALL). You might also have to install PyTables by running `sudo HDF5_DIR=/usr/local/hdf5/ pip install tables`, and add /usr/local/hdf5/lib/ to LD_LIBRARY_PATH. I'll try to develop a slower version that does not need HDF5. 16 | 17 | # Modules 18 | You can run each module individually to see examples. 19 | 20 | ## author.py 21 | Extracts the author of an article given a link. 22 | 23 | ## category.py 24 | Classifies a document. 25 | 26 | ## entities.py 27 | Named entity recognition. 28 | 29 | ## feeds.py 30 | Extracts feed URLs given a link. 31 | 32 | ## images.py (to be added) 33 | Extracts images from an HTML document and ranks them by surface area. 34 | 35 | ## main_text.py (to be added) 36 | Extracts the main text of a page given a URL. 37 | 38 | ## keywords.py 39 | Extracts the main keywords in a text document using term frequency-inverse document frequency (TF-IDF). 40 | 41 | ## sentiment.py 42 | Analyses the sentiment of a text or keyword. 43 | 44 | ## title.py 45 | Extracts page titles.
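## Usage example
A minimal sketch of calling the link extractor from Python rather than via `python link.py`. It assumes the dependencies above are installed and that the pickled models expected under `./data/` are in place:

```python
from link import Link

l = Link()
result = l.extract('https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/')
print result['title']
print result['tags']
print result['summary']
```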
46 | -------------------------------------------------------------------------------- /features/author.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from urlparse import urlparse 4 | 5 | 6 | class AuthorExtractor(object): 7 | 8 | @classmethod 9 | def clean_author(self, href, text=None): 10 | author = None 11 | if text: 12 | return text 13 | else: 14 | for part in href.split('/')[::-1]: 15 | if part not in ['', '/']: 16 | author = part.capitalize().replace('-', ' ') 17 | break 18 | 19 | return author 20 | 21 | @classmethod 22 | def extract(self, base_url, html): 23 | authors = [] 24 | soup = BeautifulSoup(html, 'html.parser') 25 | 26 | for a in soup.findAll('a'): 27 | href = a.get('href') 28 | 29 | if href: 30 | if re.search('.author.?/.', href) is not None: 31 | authors.append(self.clean_author(href, a.get_text())) 32 | elif re.search('.people/.', href) is not None: 33 | authors.append(self.clean_author(href, a.get_text())) 34 | elif (re.search('.user.?/.', href) is not None) & (re.search('.youtube.com.', href) is None): 35 | authors.append(self.clean_author(href, a.get_text())) 36 | elif re.search('.editor.?/.', href) is not None: 37 | authors.append(self.clean_author(href, a.get_text())) 38 | elif re.search('.contributor.?/.', href) is not None: 39 | authors.append(self.clean_author(href, a.get_text())) 40 | 41 | if not authors: 42 | author = None 43 | url_parse = urlparse(base_url) 44 | domain = url_parse.netloc.split('.') 45 | if len(domain) >= 2: 46 | author = domain[1] if domain[0] == 'www' else domain[0] 47 | 48 | authors.append(author.capitalize() + ' Staff') 49 | 50 | return authors 51 | 52 | if __name__ == '__main__': 53 | from goose import Goose 54 | aE = AuthorExtractor() 55 | target_url = 'http://www.wired.com/2016/03/1000-days-1000-surreal-posters-one-unfortunate-design/' 56 | article = Goose().extract(target_url) 57 | print aE.extract(target_url, article.raw_html) 58 | -------------------------------------------------------------------------------- /features/title.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from bs4 import BeautifulSoup 4 | from goose import Goose 5 | from goose.article import Article 6 | from goose.extractors.title import TitleExtractor as TitleExtractorGoose 7 | from goose.configuration import Configuration 8 | 9 | 10 | class TitleExtractor(object): 11 | 12 | SPLIT_CHARS = ['|', '–', '-'] 13 | 14 | def __init__(self): 15 | pass 16 | 17 | @classmethod 18 | def extract_text(cls, tag): 19 | if tag.string: 20 | return tag.string.strip().encode('utf-8', 'replace') 21 | return None 22 | 23 | @classmethod 24 | def _remove_duplicates_keep_order(cls, seq): 25 | seen = set() 26 | seen_add = seen.add 27 | return [x for x in seq if not (x in seen or seen_add(x))] 28 | 29 | @classmethod 30 | def extract(cls, html, html_formated): 31 | 32 | potential_titles = [] 33 | soup = BeautifulSoup(html, 'html.parser') 34 | 35 | if soup.title: 36 | page_title = TitleExtractor.extract_text(soup.title) 37 | 38 | for split_char in TitleExtractor.SPLIT_CHARS: 39 | if split_char in page_title: 40 | page_title = page_title.split(split_char)[0].strip() 41 | 42 | potential_titles.append(page_title) 43 | 44 | for heading_tag in (soup.find_all('h1') + soup.find_all('h2')): 45 | potential_title = TitleExtractor.extract_text(heading_tag) 46 | if potential_title: 47 | potential_titles.append(potential_title) 48 | 49 | # Extract 
article from goose 50 | article = Article() 51 | article.raw_html = html 52 | article.raw_doc = html_formated 53 | article.doc = article.raw_doc 54 | try: 55 | goose_title = TitleExtractorGoose(Configuration(), article).get_title() 56 | except AttributeError, e: 57 | goose_title = None 58 | 59 | return cls._remove_duplicates_keep_order(list(potential_titles + [goose_title])) 60 | # return list(set(potential_titles + [goose_title])) it doesn't preserve the order 61 | 62 | if __name__ == '__main__': 63 | 64 | tE = TitleExtractor() 65 | target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 66 | article = Goose().extract(target_url) 67 | print tE.extract(article.raw_html, article.raw_doc) 68 | -------------------------------------------------------------------------------- /features/url2text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import hashlib 3 | import requests 4 | from features.main_text import MainTextExtractor 5 | from lxml import html 6 | import pprint 7 | import re 8 | from features.file2text import File2Text 9 | 10 | 11 | class Url2Text(object): 12 | 13 | @classmethod 14 | def extract(cls, url, content_type=None): 15 | texts = [] 16 | file2text = File2Text() 17 | 18 | pdf_pattern = re.compile('.*application\/pdf.*|.*application\/octet-stream.*') 19 | html_pattern = re.compile('.*text\/html.*') 20 | 21 | try: 22 | r = requests.get(url, timeout=30) 23 | except requests.exceptions.SSLError, e: 24 | r = requests.get(url, verify=False) 25 | 26 | if not content_type: 27 | content_type = r.headers['Content-Type'] 28 | 29 | print content_type 30 | 31 | matches_html = len(re.findall(html_pattern, content_type)) 32 | matches_pdf = len(re.findall(pdf_pattern, content_type)) 33 | 34 | if r.status_code == 200: 35 | if matches_html == 0: 36 | 37 | file_prefix = hashlib.md5(url).hexdigest() 38 | 39 | dst_path = './tmp/' 40 | 41 | dst = dst_path + file_prefix + '_' + url.split('/')[-1] 42 | 43 | with open(dst, 'wb') as f: 44 | for chunk in r.iter_content(1024): 45 | f.write(chunk) 46 | # call PDF2Text 47 | texts = [file2text.extract_all(dst)] 48 | else: 49 | texts = filter(None, MainTextExtractor.extract(r.text, html.fromstring(r.text))) 50 | 51 | return texts 52 | 53 | if __name__ == '__main__': 54 | # this packages should be here but we only need the for improving the 55 | # extractor therefore it might interfere with the rest of the project 56 | url2text = Url2Text() 57 | # PDF 58 | # target_url = "https://ocs.fas.harvard.edu/files/ocs/files/undergrad_resumes_and_cover_letters.pdf" 59 | target_url = 'http://www.artisansofdevizes.com/product-collections/standard-tiles-flagstones/waldorf-limestone-collection-papyrus/' 60 | # Image 61 | target_url = 'https://onepagelove-wpengine.netdna-ssl.com/wp-content/uploads/2016/10/opl-small-1.jpg' 62 | # Mp3 if it returns and error - Run brew install sox or sudo apt-get install sox 63 | target_url = 'http://www.noiseaddicts.com/samples_1w72b820/47.mp3' 64 | # article = Goose().extract(target_url) 65 | texts = url2text.extract(target_url) 66 | print texts 67 | # print tE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /features/category.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | 
from sklearn.feature_extraction.text import TfidfTransformer 4 | import cPickle 5 | import pyfscache 6 | 7 | cache_it = pyfscache.FSCache('./cache', days=10, hours=12, minutes=30) 8 | 9 | 10 | class Classifier(object): 11 | 12 | def __init__(self, data_path='../data/'): 13 | self.data_path = data_path 14 | 15 | @cache_it 16 | def getModels(self): 17 | with open(self.data_path + '/categories.pkl', 'rb') as f: 18 | categories = cPickle.load(f) 19 | 20 | with open(self.data_path + '/category_map.pkl', 'rb') as f: 21 | category_map = cPickle.load(f) 22 | 23 | with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f: 24 | clf = cPickle.load(f) 25 | 26 | count_vect = CountVectorizer() 27 | with open(self.data_path + '/count_vect.pkl', 'rb') as f: 28 | count_vect = cPickle.load(f) 29 | 30 | tfidf_transformer = TfidfTransformer() 31 | with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f: 32 | tfidf_transformer = cPickle.load(f) 33 | 34 | with open(self.data_path + '/tree.pkl', 'rb') as f: 35 | tree = cPickle.load(f) 36 | 37 | return categories, category_map, clf, count_vect, tfidf_transformer, tree 38 | 39 | def predict(self, text): 40 | categories, category_map = [], [] 41 | categories, category_map, clf, count_vect, tfidf_transformer, tree = self.getModels() 42 | 43 | # tree.show() 44 | X_new_counts = count_vect.transform([text]) 45 | X_new_tfidf = tfidf_transformer.transform(X_new_counts) 46 | 47 | predicted = clf.predict(X_new_tfidf) 48 | 49 | predictions = [] 50 | for doc, cats in zip([text], predicted): 51 | if isinstance(cats, list): 52 | predictions += [categories[cat] for cat in cats] 53 | else: 54 | predictions.append(tree.get_node(cats).tag) 55 | 56 | return predictions 57 | 58 | if __name__ == '__main__': 59 | import cProfile 60 | import pstats 61 | clf = Classifier() 62 | text = 'Six Nations 2016: Wales 67-14 Italy Wales will finish second in the Six Nations after a record-breaking win over Italy. Warren Gatland team scored nine tries on their way to their biggest points total in a Championship game in Cardiff. Scrum-half Rhys Webb started the rout with the opening try within five minutes, and wing George North scored his fourth try in successive games. Dan Biggar also scored a try in a personal tally of 20 points. Replacement Ross Moriarty crossed twice as Wales won by a record margin of 53 points against the Italians - beating the 41-point mark set last year in Rome. Italy were completely outclassed, but crossed twice in the second half through scrum-half Guiglielmo Palazzini and centre Gonzalo Garcia. But for lacklustre first-half displays in the 16-16 draw with Ireland and the 25-21 loss to England, Wales could have been championship contenders. As it is, they will watch England - already crowned champions - go for a Grand Slam in Paris.' 63 | print clf.predict(text) 64 | 65 | cProfile.run("clf.predict(text)", 'restats') 66 | p = pstats.Stats('restats') 67 | p.sort_stats('cumulative').print_stats(30) 68 | -------------------------------------------------------------------------------- /features/pypdf_to_image.py: -------------------------------------------------------------------------------- 1 | """ 2 | Problem: 3 | How to Convert PDF to Image with Python Script ? 
4 | 5 | Installation: 6 | I use Ubuntu 14.04. 7 | We use Wand, a wrapper for ImageMagick [http://www.imagemagick.org/script/index.php], to convert the PDF file in Python. 8 | To install it: 9 | 10 | $ sudo apt-get install libmagickwand-dev 11 | $ pip install Wand 12 | 13 | Now install Pillow (the PIL fork): 14 | $ pip install Pillow 15 | 16 | More installation notes: http://sorry-wand.readthedocs.org/en/latest/guide/install.html 17 | More about Wand: https://pypi.python.org/pypi/Wand 18 | """ 19 | 20 | from PIL import Image as Img 21 | from wand.image import Image 22 | import uuid 23 | import numpy as np 24 | import glob 25 | import os 26 | import sys 27 | 28 | def convert(filepdf): 29 | # Generate a temp file name so we do not duplicate or replace anything. 30 | uuid_set = str(uuid.uuid4().fields[-1])[:5] 31 | try: 32 | # Now let's convert the PDF to images. 33 | # A resolution of 200 dpi is good as far as I know. 34 | with Image(filename=filepdf, resolution=200) as img: 35 | # Keep good quality. 36 | img.compression_quality = 80 37 | # Save it under the temp name. 38 | img.save(filename="./data/temp%s.jpg" % uuid_set) 39 | except Exception, err: 40 | # Always keep track of the error until the code has been cleaned up. 41 | #print err 42 | print err 43 | return False 44 | else: 45 | """ 46 | The PDF has been converted successfully, 47 | but each page is saved as a separate image, 48 | so now we need to merge all the files. 49 | """ 50 | pathsave = [] 51 | try: 52 | # Search for all images in the temp path whose names contain the uuid_set value. 53 | list_im = glob.glob("./data/temp%s*.jpg" % uuid_set) 54 | list_im.sort() # Sort the files before joining them. 55 | imgs = [Img.open(i) for i in list_im] 56 | # Now let's combine the page images vertically. 57 | min_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[0][1] 58 | imgs_comb = np.vstack( 59 | (np.asarray(i.resize(min_shape)) for i in imgs)) 60 | # For horizontal stacking, change the vstack to hstack. 61 | imgs_comb = Img.fromarray(imgs_comb) 62 | pathsave = "./data/my_pdf%s.jpg" % uuid_set 63 | # Now save the combined image. 64 | imgs_comb.save(pathsave) 65 | # ...and then remove all the temp images. 66 | for i in list_im: 67 | os.remove(i) 68 | except Exception, err: 69 | #print err 70 | return False 71 | return pathsave 72 | 73 | if __name__ == "__main__": 74 | arg = sys.argv[1] 75 | result = convert(arg) 76 | if result: 77 | print "[*] Successfully converted %s and saved it to %s" % (arg, result) 78 | else: 79 | print "[!] Whoops, something went wrong. Enable the err variable to track it." 80 | 81 | """ 82 | =========================================== 83 | Running Test: 84 | python testing-pdf.py zz.pdf 85 | [*] Successfully converted zz.pdf and saved it to Resume63245.jpg 86 | 87 | =========================================== 88 | """ 89 | # Well, I hope this will be useful for you & others.
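A hedged sketch of calling `convert()` from another module instead of the CLI entry point above, mirroring how `features/file2text.py` imports it. The `sample.pdf` path is a placeholder, and the `./data/` directory must exist for the temporary page images:

```python
from features.pypdf_to_image import convert

jpg_path = convert('sample.pdf')  # placeholder path, not a file shipped with this repo
if jpg_path:
    print "[*] Combined page image written to %s" % jpg_path
else:
    print "[!] Conversion failed; see the err output inside convert()"
```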
-------------------------------------------------------------------------------- /features/file2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 5 | from pdfminer.converter import TextConverter 6 | from pdfminer.layout import LAParams 7 | from pdfminer.pdfpage import PDFPage 8 | from cStringIO import StringIO 9 | import textract 10 | from features.pypdf_to_image import convert as convert_pdf_to_image 11 | from time import time 12 | import os 13 | 14 | 15 | class File2Text(object): 16 | """docstring for File2Text""" 17 | def __init__(self): 18 | super(File2Text, self).__init__() 19 | 20 | def extract(self, src, maxpages=0): 21 | rsrcmgr = PDFResourceManager() 22 | retstr = StringIO() 23 | codec = 'utf-8' 24 | laparams = LAParams(line_overlap=0.5, 25 | char_margin=2.0, 26 | line_margin=0.5, 27 | word_margin=0.1, 28 | boxes_flow=0.5, 29 | detect_vertical=True, 30 | all_texts=True) 31 | device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) 32 | # device = TextConverter(rsrcmgr, retstr, codec=codec) 33 | fp = file(src, 'rb') 34 | interpreter = PDFPageInterpreter(rsrcmgr, device) 35 | password = "" 36 | caching = True 37 | pagenos = set() 38 | 39 | for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): 40 | interpreter.process_page(page) 41 | 42 | text = retstr.getvalue() 43 | 44 | fp.close() 45 | device.close() 46 | retstr.close() 47 | return text 48 | 49 | def extract_all(self, src, maxpages=0): 50 | if '.pdf' in src: 51 | try: 52 | start = time() 53 | text = self.extract(src, maxpages=maxpages) 54 | print "case 1 elapsed_time {}s".format(time() - start) 55 | except Exception, e: 56 | start = time() 57 | text = textract.process(src) 58 | print "case 2 elapsed_time {}s".format(time() - start) 59 | 60 | else: 61 | # TODO: allow other formats 62 | # return None 63 | start = time() 64 | text = textract.process(src) 65 | print "case 3 elapsed_time {}s".format(time() - start) 66 | 67 | 68 | # if text and len(text.strip()) == 0: 69 | # text = None 70 | 71 | if not text or len(text) < 10: 72 | # TODO: Speed this process up 73 | # return None 74 | print "...attempting convert_pdf_to_image" 75 | start = time() 76 | pdf_path = convert_pdf_to_image(src) 77 | text = textract.process(pdf_path) 78 | os.remove(pdf_path) 79 | print "case 4 elapsed_time {}s".format(time() - start) 80 | 81 | return text 82 | 83 | if __name__ == '__main__': 84 | src = '/Volumes/FLUFFUSHFS/Datasets/product_properties/data/documents/105293334418129088138c7cf90dacf7_hush-acoustics_Hush-Panel-32_Specifications_NR282-12-Hush-Panel-32.pdf' 85 | # src = '/Volumes/FLUFFUSHFS/Datasets/product_properties/data/documents/0a3a09b2ddb9615ba06cb2f7b812a3b4_ruukki-uk_C-purlin_Technical-Files_LP-IN05-EN.pdf' 86 | #print File2Text.extract(src) 87 | pdf2text = File2Text() 88 | # print File2Text.extract(src) 89 | print pdf2text.extract_all(src) 90 | # print convert_pdf_to_txt(src) 91 | -------------------------------------------------------------------------------- /features/images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from urlparse import urlparse 4 | import urllib 5 | import cStringIO 6 | from PIL import Image 7 | 8 | 9 | class ImagesExtractor(object): 10 | 11 | @classmethod 12 | def 
_normalise_url(cls, base_url, url): 13 | """ 14 | Normalises a relative url to an absolute one if base domain is not 15 | present. 16 | """ 17 | parsed_url = urlparse(url) 18 | 19 | if not parsed_url.netloc: 20 | return base_url + '/'.join([segment 21 | for segment in parsed_url.path.split('/') 22 | if segment not in ['..', '.', '', None]]) + '?' + parsed_url.query 23 | elif not parsed_url.scheme: 24 | return 'http:' + url 25 | 26 | return url 27 | 28 | @classmethod 29 | def _process_image(cls, base_url, img_url): 30 | return ImagesExtractor._normalise_url(base_url, img_url) 31 | 32 | @classmethod 33 | def _get_zoom_image(cls, img): 34 | if img.parent.get('href'): 35 | return img.parent.get('href') 36 | else: # if not href, try data attributes 37 | for key, value in img.parent.attrs.iteritems(): 38 | if key.startswith('data-'): 39 | if value.startswith('http://'): 40 | return value 41 | return None 42 | 43 | @classmethod 44 | def _get_meta_image(cls, meta_tag): 45 | return meta_tag.get('content') 46 | 47 | @classmethod 48 | def select_top_image(cls, images): 49 | max_surface = 0 50 | selected = None 51 | for img_src in images: 52 | # print img_src 53 | try: 54 | file = cStringIO.StringIO(urllib.urlopen(img_src).read()) 55 | except UnicodeDecodeError, e: 56 | print e 57 | continue 58 | except IOError, e: 59 | print e 60 | continue 61 | 62 | try: 63 | im = Image.open(file) 64 | except IOError, e: 65 | continue 66 | width, height = im.size 67 | if width > 100 and height > 100: 68 | surface = (width * height) / 2 69 | if surface > max_surface: 70 | selected = img_src 71 | max_surface = surface 72 | 73 | return selected 74 | 75 | @classmethod 76 | def rank(cls, images): 77 | images_aux = [] 78 | 79 | for img_src in images: 80 | try: 81 | file = cStringIO.StringIO(urllib.urlopen(img_src).read()) 82 | im = Image.open(file) 83 | except UnicodeDecodeError, e: 84 | continue 85 | except IOError, e: 86 | continue 87 | 88 | width, height = im.size 89 | if width > 100 and height > 100: 90 | surface = (width * height) / 2 91 | images_aux.append([img_src, surface]) 92 | 93 | images_aux.sort(key=lambda x: x[1], reverse=True) 94 | 95 | return images_aux 96 | 97 | @classmethod 98 | def extract(cls, base_url, html): 99 | soup = BeautifulSoup(html, 'html.parser') 100 | 101 | img_tag_urls = filter(None, [img.get('src') for img in soup.find_all('img')]) 102 | zoom_img_urls = [] # zoom_img_urls = filter(None, [ImagesExtractor._get_zoom_image(img) for img in soup.select('a > img')]) 103 | meta_img_urls = filter(None, [ImagesExtractor._get_meta_image(mtag) for mtag in soup.select('meta[property=og:image]')]) 104 | image_urls = img_tag_urls + zoom_img_urls + meta_img_urls 105 | 106 | return [ImagesExtractor._process_image(base_url, image_url) for image_url in image_urls] 107 | 108 | if __name__ == '__main__': 109 | from goose import Goose 110 | iE = ImagesExtractor() 111 | # target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 112 | target_url = 'https://www.trilux.com/products/en/Indoor-lighting/Continuous-line-luminaires-and-batten-luminaires/E-Line-LED-IP20-54-rapid-mounting-continuous-line/?retainFilter=true' 113 | article = Goose().extract(target_url) 114 | print iE.extract(target_url, article.raw_html) 115 | -------------------------------------------------------------------------------- /link.py: -------------------------------------------------------------------------------- 1 | from features.title import TitleExtractor 2 | 
from features.main_text import MainTextExtractor 3 | from features.images import ImagesExtractor 4 | from features.sentiment import getSentimentText, findSentiment 5 | from features.keywords import KeywordsExtractor 6 | from features.entities import Entities 7 | from features.author import AuthorExtractor 8 | from features.category import Classifier 9 | from features.url2text import Url2Text 10 | from goose import Goose 11 | from lxml import etree 12 | from pyteaser_c import Summarize 13 | from pyteaser_c import SummarizePage 14 | from pyteaser_c import GetArticle 15 | from pyteaser_c import keywords 16 | from textblob import TextBlob 17 | import langid 18 | from bs4 import BeautifulSoup 19 | #import lxml.html 20 | from lxml import html 21 | import requests 22 | import pprint 23 | import re 24 | import os 25 | 26 | 27 | class NoMainTextException(Exception): 28 | pass 29 | 30 | 31 | class Link(object): 32 | # def is_html(self): 33 | # pass 34 | 35 | @classmethod 36 | def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'): 37 | errors, summaries, categories, entities, keywords = [], [], [], [], [] 38 | pdf_pattern = re.compile('.*application\/pdf.*|.*application\/octet-stream.*') 39 | html_pattern = re.compile('.*text\/html.*') 40 | 41 | article = Goose().extract(link) 42 | 43 | content_type = article.__dict__['additional_data']['result'].info()['content-type'] 44 | matches_html = len(re.findall(html_pattern, content_type)) 45 | matches_pdf = len(re.findall(pdf_pattern, content_type)) 46 | 47 | if matches_html == 0: 48 | # Textract 49 | url2text = Url2Text() 50 | texts = url2text.extract(link) 51 | 52 | k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path) 53 | ent = Entities() 54 | clf = Classifier(data_path=data_path) 55 | 56 | return { 57 | "title": os.path.basename(link), 58 | "link": link, 59 | "author": [], 60 | "cleaned_text": texts[0], 61 | "text_sentiment": getSentimentText(texts[0]), 62 | "main_body": None, 63 | "images": None, 64 | "image": None, 65 | "date": article.__dict__['additional_data']['result'].info()['last-modified'], 66 | "tags": k.extract([texts[0]], None, None, 'news')[0], 67 | "entities": ent.extract(texts[0], entity_description), 68 | "language": langid.classify(texts[0])[0], 69 | "summary": Summarize(None, texts[0]), 70 | "categories": clf.predict(texts[0]) 71 | } 72 | pass 73 | else: 74 | 75 | valid_html = bool(BeautifulSoup(article.raw_html[0:100], "html.parser").find()) 76 | 77 | if not valid_html: 78 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'} 79 | r = requests.get(link, headers=headers) 80 | article.raw_html = r.text 81 | article.raw_doc = html.fromstring(r.text) 82 | 83 | if article.raw_doc is None: 84 | raise NoMainTextException 85 | 86 | authors = AuthorExtractor.extract(link, article.raw_html) 87 | publish_date = article.publish_date if article.publish_date else None 88 | 89 | if not article.title: 90 | article.title = TitleExtractor.extract( 91 | article.raw_html, article.raw_doc)[0] 92 | 93 | k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path) 94 | 95 | if article.top_node is not None: 96 | main_body = etree.tostring(article.top_node) 97 | else: 98 | cleant_text_suggestions = MainTextExtractor.extract(article.raw_html, article.raw_doc) 99 | article.cleaned_text = cleant_text_suggestions[1] 100 | if not article.cleaned_text: 101 | article.cleaned_text = 
cleant_text_suggestions[2] 102 | if not article.cleaned_text: 103 | raise NoMainTextException 104 | main_body = 'Sorry, we could not detect the main HTML content for this article' 105 | 106 | try: 107 | summaries = Summarize( 108 | article.title, article.cleaned_text.encode('utf-8', 'ignore')) 109 | except Exception, e: 110 | summaries.append('We could not make summaries at this time.') 111 | 112 | try: 113 | text_sentiment = getSentimentText(article.cleaned_text) 114 | except Exception, e: 115 | text_sentiment = None 116 | text = article.title + " " + article.cleaned_text 117 | keywords = k.extract([text], None, None, 'news')[0] 118 | 119 | # Get keywords from meta tag 120 | if not keywords: 121 | keywords = article.meta_keywords.split(',') 122 | 123 | # Get keywords from Goose 124 | if not keywords: 125 | keywords = [t for t in article.tags] 126 | 127 | if sentiment: 128 | keywords = findSentiment(keywords) 129 | 130 | ent = Entities() 131 | try: 132 | entities = ent.extract(text, entity_description) 133 | except Exception, e: 134 | entities.append('We could not extract entities at this time.') 135 | 136 | if sentiment: 137 | entities = findSentiment(entities) 138 | 139 | language = article.meta_lang 140 | 141 | if not language: 142 | language = langid.classify(article.cleaned_text)[0] 143 | 144 | if language in ['en', 'eo']: 145 | clf = Classifier(data_path=data_path) 146 | article.categories = clf.predict(text) 147 | else: 148 | article.categories = ["Article classification not ready for: " + language[0]] 149 | 150 | images = ImagesExtractor.extract(link, article.raw_html) 151 | 152 | if article.top_image: 153 | thumbnail = article.top_image.src 154 | else: 155 | #thumbnail = images[0] if images else None 156 | thumbnail = ImagesExtractor.select_top_image(images[0:50]) 157 | 158 | return { 159 | "title": article.title, 160 | "link": article.final_url, 161 | "author": authors, 162 | "cleaned_text": article.cleaned_text, 163 | "text_sentiment": text_sentiment, 164 | "main_body": main_body, 165 | "images": images, 166 | "image": thumbnail, 167 | "date": article.publish_date, 168 | "tags": keywords, 169 | "entities": entities, 170 | "language": language, 171 | "summary": summaries, 172 | "categories": article.categories 173 | } 174 | 175 | if __name__ == '__main__': 176 | import pprint 177 | l = Link() 178 | url = 'https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/' 179 | # l = l.extract('http://techcrunch.com/2016/03/18/twitter-says-few-users-have-opted-out-of-its-new-algorithmic-timeline/') 180 | #l = l.extract('https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/') 181 | # l = l.extract('http://www.independent.co.uk/life-style/gadgets-and-tech/features/google-lens-ai-preview-features-so-impressive-its-scary-a7745686.html') 182 | # l = l.extract('https://onepagelove-wpengine.netdna-ssl.com/wp-content/uploads/2016/10/opl-small-1.jpg') 183 | target_url = 'http://www.noiseaddicts.com/samples_1w72b820/47.mp3' 184 | l = l.extract(target_url) 185 | 186 | pprint.pprint(l) 187 | # import requests 188 | # r = requests.get(url) 189 | # print r.text 190 | -------------------------------------------------------------------------------- /features/entities.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import requests 3 | 4 | 5 | class Entities(object): 6 | 7 | def remove_return_lines_and_quotes(self, text): 8 | text = text.replace('\n', ' ') 9 | text = text.replace('\t', ' ') 10 | text = 
text.replace('\r', ' ') 11 | text = text.replace('"', '') 12 | return text 13 | 14 | def extract(self, text, entity_description=False): 15 | # We need to clean the text in each method otherwise when we present it 16 | # to the user, it will have a different format 17 | text = self.remove_return_lines_and_quotes(text) 18 | sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)] 19 | 20 | # This function is quite expensive 21 | sentences = [nltk.pos_tag(sent) for sent in sentences] 22 | 23 | entities_all = {} if entity_description else [] 24 | 25 | #stop = stopwords.words('english') 26 | # more_stop_words = ['(' , ')', "'s" , ',', ':' , '<' , '>' , '.' , '-' , '&' ,'*','...' , 'therefore' , '.vs','hence'] 27 | # stop = stopwords.words('english') 28 | # stop = stop + more_stop_words 29 | stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep keeps", 30 | "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", 
"mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"] 31 | 32 | for s in sentences: 33 | chunked = nltk.ne_chunk(s, binary=True) 34 | for n in chunked: 35 | if isinstance(n, nltk.tree.Tree): 36 | if n.label() == 'NE': 37 | entities_all = self.getEntity(n, stop, entities_all, entity_description) 38 | 39 | if entity_description: 40 | return entities_all 41 | else: 42 | return list(set(entities_all)) 43 | 44 | def getEntity(self, n, stop, entities_all, entity_description=None): 45 | entity = None 46 | 47 | for c in n: 48 | entity = c[0] if not entity else entity + " " + c[0] 49 | 50 | entity_lower = entity.lower() 51 | entity_lower = [i for i in [entity_lower] if i not in stop] 52 | 53 | if entity_lower: 54 | if entity_description: 55 | entity_dbpedia = self.lookup_entity(entity) 56 | entities_all[entity_dbpedia['name']] = entity_dbpedia 57 | else: 58 | entities_all.append(entity) 59 | 60 | return entities_all 61 | 62 | def lookup_entity(self, entity): 63 | entity_dbpedia = {} 64 | entity_dbpedia['name'] = entity 65 | entity_dbpedia['categories'] = [] 66 | entity_dbpedia['classes'] = [] 67 | entity_dbpedia['description'] = None 68 | 69 | headers = { 70 | 'content-type': 'application/json', 71 | 'Accept': 'application/json' 72 | } 73 | 74 | r = requests.get('http://lookup.dbpedia.org/api/search/PrefixSearch?MaxHits=2&QueryString=' + entity, headers=headers) 75 | 76 | if r.status_code == 200: 77 | r_json = r.json() 78 | if r_json['results']: 79 | try: 80 | entity_dbpedia['description'] = r_json['results'][0]['description'] 81 | except KeyError, e: 82 | pass 83 | 84 | try: 85 | entity_dbpedia['categories'] = r_json['results'][0]['categories'][0] 86 | except KeyError, e: 87 | pass 88 | 89 | try: 90 | 
entity_dbpedia['classes'] = r_json['results'][0]['classes'] 91 | except KeyError, e: 92 | pass 93 | 94 | return entity_dbpedia 95 | 96 | if __name__ == '__main__': 97 | e = Entities() 98 | text = "Iain Duncan Smith has criticised the government's desperate search for savings in his first interview since resigning as work and pensions secretary." 99 | print e.extract(text, entity_description=True) 100 | -------------------------------------------------------------------------------- /features/keywords.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import nltk 3 | from nltk.stem.porter import PorterStemmer 4 | from nltk.corpus import stopwords 5 | from nltk.collocations import * 6 | import numpy as np 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | import sys 9 | import string 10 | import cPickle 11 | from bs4 import BeautifulSoup 12 | from os import listdir 13 | from os.path import isfile, join, split 14 | import cProfile 15 | import pstats 16 | import tables 17 | import numpy as np 18 | import csv 19 | from pprint import pprint 20 | nltk.data.path = ['home/ubuntu/nltk_data', '/Users/joanfihu/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data' ,'/home/ubuntu/nltk_data'] 21 | 22 | 23 | class KeywordsExtractor(object): 24 | 25 | num_kewyords = 0 26 | data_path = './data/' 27 | stemmer = PorterStemmer() 28 | verbose = None 29 | 30 | def __init__(self, num_kewyords=10, data_path='../data/', verbose=False): 31 | self.num_kewyords = num_kewyords 32 | self.data_path = data_path 33 | self.verbose = verbose 34 | 35 | def stem_tokens(self, tokens): 36 | return [self.stemmer.stem(item) for item in tokens] 37 | 38 | def tokenize(self, text): 39 | soup = BeautifulSoup(text, 'html.parser') 40 | text = soup.getText() 41 | text = filter(lambda x: x in string.printable, text) 42 | lowers = str(text).lower() 43 | text = lowers.translate(None, string.punctuation) 44 | tokens = nltk.word_tokenize(text) 45 | stems = tokens 46 | #stems = self.stem_tokens(tokens) 47 | return stems 48 | 49 | def tokenize2(self, text): 50 | lowers = str(text).lower() 51 | text = lowers.translate(None, string.punctuation) 52 | return nltk.word_tokenize(text) 53 | 54 | def get_bbc_news_corpus(self): 55 | news_corpus = [] 56 | for news_type in ['business', 'entertainment', 'politics', 'sport', 'tech']: 57 | mypath = self.data_path + 'bbc/' + news_type 58 | onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] 59 | for file_name in onlyfiles: 60 | f = open(self.data_path + 'bbc/' + news_type + '/' + file_name, 'r') 61 | 62 | text = f.read().decode('utf-8', 'replace') 63 | 64 | soup = BeautifulSoup(text, 'html.parser') 65 | text = soup.getText() 66 | text = filter(lambda x: x in string.printable, text) 67 | lowers = str(text).lower() 68 | text = lowers.translate(None, string.punctuation) 69 | 70 | news_corpus.append(text) 71 | f.close() 72 | 73 | return news_corpus 74 | 75 | def get_specifiedby_corpus(self): 76 | specifiedby_corpus = [] 77 | with open(self.data_path + 'specifiedby_corpus.csv', 'rU') as csvfile: 78 | reader = csv.reader(csvfile, delimiter=',', quotechar='"') 79 | header = next(reader) 80 | 81 | processed, failed = 0, 0 82 | for row in reader: 83 | if row: 84 | try: 85 | specifiedby_corpus.append(self.remove_non_ascii(row[0] + ' ' + row[1])) 86 | processed += 1 87 | except Exception, e: 88 | failed += 1 89 | else: 90 | failed += 1 91 | 92 | print 
"get_specifiedby_corpus - processed: {} failed {}".format(processed, failed) 93 | return specifiedby_corpus 94 | #return [" ".join(specifiedby_corpus)] 95 | 96 | def remove_non_ascii(self, text): 97 | """ 98 | Removes non ascii characters by converting them to their integers 99 | and then remove anythin above ref 128 100 | Parameters : 101 | - text: text to remove characters 102 | """ 103 | return ''.join([i if ord(i) < 128 else ' ' for i in text]) 104 | 105 | def train_tfidf(self, tokenizer='custom', corpus='news'): 106 | 107 | if tokenizer == 'custom': 108 | #tokenizer = self.tokenize 109 | tokenizer = self.tokenize2 110 | 111 | nltk_corpus = [] 112 | if corpus == 'all': 113 | nltk_corpus += [nltk.corpus.gutenberg.raw(f_id) for f_id in nltk.corpus.gutenberg.fileids()] 114 | nltk_corpus += [nltk.corpus.webtext.raw(f_id) for f_id in nltk.corpus.webtext.fileids()] 115 | nltk_corpus += [nltk.corpus.brown.raw(f_id) for f_id in nltk.corpus.brown.fileids()] 116 | nltk_corpus += [nltk.corpus.reuters.raw(f_id) for f_id in nltk.corpus.reuters.fileids()] 117 | elif corpus == 'news': 118 | nltk_corpus += self.get_bbc_news_corpus() 119 | nltk_corpus += self.get_specifiedby_corpus() 120 | 121 | if self.verbose: 122 | print "LENGTH nltk corpus corpus: {}".format(sum([len(d) for d in nltk_corpus])) 123 | 124 | 125 | vectorizer = TfidfVectorizer( 126 | max_df=0.5, 127 | min_df=150, 128 | encoding='utf-8', 129 | decode_error='strict', 130 | max_features=None, 131 | stop_words='english', 132 | ngram_range=(1, 3), 133 | norm='l2', 134 | tokenizer=tokenizer, 135 | analyzer='word', 136 | use_idf=True, 137 | sublinear_tf=False) 138 | 139 | #vectorizer.fit_transform(nltk_corpus) 140 | vectorizer.fit(nltk_corpus) 141 | # Avoid having to pickle instance methods, we will set this method on on load 142 | vectorizer.tokenizer = None 143 | keys = np.array(vectorizer.vocabulary_.keys(), dtype=str) 144 | values = np.array(vectorizer.vocabulary_.values(), dtype=int) 145 | stop_words = np.array(list(vectorizer.stop_words_), dtype=str) 146 | 147 | with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'w') as f: 148 | atom = tables.Atom.from_dtype(keys.dtype) 149 | ds = f.createCArray(f.root, 'keys', atom, keys.shape) 150 | ds[:] = keys 151 | 152 | with tables.openFile(self.data_path + 'tfidf_values.hdf', 'w') as f: 153 | atom = tables.Atom.from_dtype(values.dtype) 154 | ds = f.createCArray(f.root, 'values', atom, values.shape) 155 | ds[:] = values 156 | 157 | with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'w') as f: 158 | atom = tables.Atom.from_dtype(stop_words.dtype) 159 | ds = f.createCArray(f.root, 'stop_words', atom, stop_words.shape) 160 | ds[:] = stop_words 161 | 162 | vectorizer.vocabulary_ = None 163 | vectorizer.stop_words_ = None 164 | 165 | with open(self.data_path + 'tfidf.pkl', 'wb') as fin: 166 | cPickle.dump(vectorizer, fin) 167 | 168 | vectorizer.vocabulary_ = dict(zip(keys, values)) 169 | vectorizer.stop_words_ = stop_words 170 | 171 | return vectorizer 172 | 173 | def extract_bigrams(self, text): 174 | 175 | text = self.remove_return_lines_and_quotes(text) 176 | bigrams = [] 177 | 178 | st = PorterStemmer() 179 | stop = stopwords.words('english') 180 | 181 | more_stop_words = [ 182 | '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...'] 183 | stop = stopwords.words('english') 184 | stop = stop + more_stop_words 185 | 186 | tokens = st.stem(text) 187 | tokens = nltk.word_tokenize(tokens.lower()) 188 | tokens = [i for i in tokens if i not in stop] 189 | tokens = [word for word in 
tokens if len(word) > 2] 190 | 191 | bigram_measures = nltk.collocations.BigramAssocMeasures() 192 | finder = BigramCollocationFinder.from_words(tokens) 193 | finder.apply_freq_filter(2) 194 | top_bigrams = finder.nbest(bigram_measures.pmi, 1000) 195 | 196 | for bg in top_bigrams: 197 | bg = " ".join(bg) 198 | tag = nltk.pos_tag([bg])[0] 199 | 200 | if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']: 201 | bigrams.append(tag[0]) 202 | 203 | return bigrams 204 | 205 | def get_tfidf_model(self): 206 | with open(self.data_path + 'tfidf.pkl', 'rb') as pkl_file: 207 | vectorizer = cPickle.load(pkl_file) 208 | 209 | vectorizer.tokenizer = self.tokenize 210 | 211 | with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'r') as f: 212 | keys = f.root.keys.read() 213 | 214 | with tables.openFile(self.data_path + 'tfidf_values.hdf', 'r') as f: 215 | values = f.root.values.read() 216 | 217 | vectorizer.vocabulary_ = dict(zip(keys, values)) 218 | 219 | with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'r') as f: 220 | vectorizer.stop_words_ = set(f.root.stop_words.read()) 221 | 222 | return vectorizer 223 | 224 | def remove_return_lines_and_quotes(self, text): 225 | text = text.replace('\n', ' ') 226 | text = text.replace('\t', ' ') 227 | text = text.replace('\r', ' ') 228 | text = text.replace('"', '') 229 | return text 230 | 231 | def extract(self, documents=None, vectorizer=None, tokenizer='custom', tfidf_corpus='news'): 232 | 233 | try: 234 | vectorizer = self.get_tfidf_model() 235 | except (EOFError, IOError), e: 236 | vectorizer = self.train_tfidf(tokenizer, tfidf_corpus) 237 | 238 | docs = vectorizer.transform(documents) 239 | 240 | feature_names = vectorizer.get_feature_names() 241 | features = [] 242 | for i in xrange(docs.shape[0]): 243 | 244 | sort_score_indices = np.argsort(docs[i, :].data) 245 | top_n_indices = self.num_kewyords if (len(sort_score_indices)) > self.num_kewyords else len(sort_score_indices) 246 | top_features_indices = [] 247 | 248 | if top_n_indices: 249 | top_features_indices = docs[i, :].indices[np.argsort(docs[i, :].data)[::-1][:top_n_indices]] 250 | 251 | top_features_names = [feature_names[f] for f in top_features_indices] 252 | 253 | # Extract most common bigrams. TFIDF gives more relevance to 254 | # unigrams than bigrams 255 | # bigrams = self.extract_bigrams(documents[i]) 256 | # top_features_names = list(set(top_features_names + bigrams)) 257 | 258 | features.append(top_features_names) 259 | 260 | return features 261 | 262 | if __name__ == '__main__': 263 | k = KeywordsExtractor(num_kewyords=100, verbose=True, data_path='../data/') 264 | document = "Iain Duncan Smith has criticised the government's desperate search for savings in his first interview since resigning as work and pensions secretary." 265 | document = "High-quality, contemporary facing brick available with a smooth or textured finish. There are bricks. And there are bricks you can design with. If you're used to assuming a choice of one colour and one finish, why not choose a brick that can become part of your design process instead? The Oakland range of brick can add a spark to architectural designs - whether your next project is traditional, contemporary or avant-garde, choose from a broad range of precision facing bricks with dynamic colours that will help bring the final project to life. Oakland Brick is available in a range of 20 colour and texture combinations. 
LOW EFFLORESCENCE\r\n\r\nAG's brick range is free from soluble salt, meaning that Oakland Brick's levels of efflorescence are extremely low.\r\n\r\n\r\nCOMPLEMENTARY SPECIALS\r\n\r\nA range of complementary specials are available.\r\n\r\n\r\nBRE GREEN GUIDE 'A' RATED\r\n\r\nOakland Brick is produced in the UK from locally sourced materials and manufactured with 90 harvested rain water and 100 renewable energy in the production process.\n\nProperties: Smooth, Textured, 1, BS EN 7713, ISO 9001, ISO 14001, A+, F2, Frost Resistant, A1 Oakland Brick - A contemporary brick with clean, crisp lines" 266 | print k.extract(documents=[document])[0] 267 | # k.get_specifiedby_corpus() 268 | # cProfile.run("k.extract(documents=[document])[0]", 'restats') 269 | p = pstats.Stats('restats') 270 | p.sort_stats('cumulative').print_stats(30) 271 | 272 | 273 | #print k.train_tfidf() 274 | #print k.get_tfidf_model() 275 | -------------------------------------------------------------------------------- /features/main_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup, Comment 3 | from sklearn import preprocessing 4 | import numpy as np 5 | from goose import Goose 6 | from goose.article import Article 7 | from goose.configuration import Configuration 8 | from goose.cleaners import DocumentCleaner 9 | from goose.extractors.content import ContentExtractor 10 | from goose.extractors.images import ImageExtractor 11 | from goose.outputformatters import OutputFormatter 12 | 13 | 14 | class MainTextExtractor(object): 15 | 16 | @classmethod 17 | def _remove_chars(cls, text): 18 | stripped = text.strip(' \t\n\r') 19 | if not stripped: 20 | return None 21 | else: 22 | return stripped 23 | 24 | @classmethod 25 | def _main_paragraph_text(cls, html): 26 | """ 27 | Method to extract the main content by only looking 28 | at the html p tag elements. 
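Paragraphs of min_length (30) characters or fewer are skipped and the surviving paragraphs are joined with newlines.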
29 | """ 30 | 31 | soup = BeautifulSoup(html, 'html.parser') 32 | min_length = 30 33 | feature_text = [] 34 | 35 | for p in soup.find_all('p'): 36 | text_content = MainTextExtractor._remove_chars(p.get_text(strip=True)) 37 | if text_content and (len(text_content) > min_length): 38 | feature_text.append(text_content) 39 | 40 | return ' \n'.join(feature_text) 41 | 42 | @classmethod 43 | def removeCharacters(self, s): 44 | s = s.strip(' \t\n\r') 45 | if ((s == "") | (s==None)): 46 | return None 47 | return s 48 | 49 | @classmethod 50 | def _parse_tags(cls, html): 51 | 52 | excluded_tags = ['script', 'style', 'noscript', 'html', 'head', 'meta', 'header', 'footer', 53 | 'link', 'body', 'input', 'form', 'a'] 54 | minimum_text_node_length = 8 55 | 56 | y_data = [] 57 | text_data = [] 58 | tag_signatures = [] 59 | 60 | soup = BeautifulSoup(html, 'html.parser') 61 | 62 | for tag in soup.findAll(): 63 | 64 | path = '.'.join(reversed([p.name for p in tag.parentGenerator() if p])) 65 | tag_signature = '.'.join([path, tag.name]) 66 | 67 | if (tag.name not in excluded_tags) and ('table' not in path): 68 | 69 | tag_text = [] 70 | for text in tag.contents: 71 | if isinstance(text, Comment): 72 | continue 73 | try: 74 | text = text.strip() 75 | aux = BeautifulSoup(text, 'html.parser') 76 | if aux.find() is None: 77 | tag_text.append(text) 78 | except Exception, e: 79 | pass 80 | 81 | tag_text = "\n".join(tag_text) 82 | 83 | if tag_text and len(tag_text) > minimum_text_node_length: 84 | if tag_text not in text_data: 85 | 86 | # Remove line returns and tabs 87 | tag_text = cls._remove_chars(tag_text) 88 | if tag_text: 89 | y_data.append(len(tag_text)) 90 | text_data.append(tag_text) 91 | tag_signatures.append(path) 92 | 93 | x = np.array(y_data) 94 | return x, text_data, tag_signatures 95 | 96 | @classmethod 97 | def _find_intervals(cls, x): 98 | """ 99 | The main content is ofteb located between two points l_ini and l_end. 100 | This method aims to find two pointers where the distance between l_ini 101 | and l_end is minimum and the number of characters between 102 | these pointers is maximum. 103 | """ 104 | x = np.array(x) 105 | x_length = x.shape[0] 106 | total = np.sum(x) 107 | mean = np.mean(x) 108 | 109 | # Locate where the maximum is 110 | max_pointer = np.argmax(x) 111 | max_accum = 0 112 | pointer_left, pointer_right = max_pointer, max_pointer 113 | 114 | # Find tags higher than average in the left neighbourhood until there 115 | # is no hope 116 | hope_left_state, hope_right_state = True, True 117 | while pointer_left > 0 and hope_left_state: 118 | # if the x[pointer_left - 1] is greater than the mean, move 119 | # the pointer one to the left 120 | if x[pointer_left - 1] > mean: 121 | pointer_left -= 1 122 | else: 123 | # Is it worth to move to the left? If yes, there is hope. 124 | pointer_left_hope = pointer_left - 5 125 | if pointer_left_hope < 0: 126 | pointer_left_hope = 0 127 | 128 | hope_left = x[pointer_left_hope:pointer_left][::-1] 129 | max_hope_left_value = np.max(hope_left) 130 | max_hope_left_idx = pointer_left - np.argmax(hope_left) - 1 131 | 132 | if max_hope_left_value > mean: 133 | pointer_left = max_hope_left_idx 134 | else: 135 | hope_left_state = False 136 | 137 | # Same reasoning as previous one but for the right 138 | while (pointer_right + 1) < x_length and hope_right_state: 139 | if x[pointer_right + 1] > mean: 140 | pointer_right += 1 141 | else: 142 | # Is it worth to move to the right? 
143 | pointer_right_hope = pointer_right + 5 144 | 145 | if pointer_right_hope > len(x): 146 | pointer_right_hope = len(x) 147 | 148 | hope_right = x[(pointer_right + 1): pointer_right_hope] 149 | max_hope_right_value = np.max(hope_right) 150 | max_hope_right_idx = pointer_right + np.argmax(hope_right) + 1 151 | 152 | if max_hope_right_value > mean: 153 | pointer_right = max_hope_right_idx 154 | else: 155 | hope_right_state = False 156 | 157 | # Find a cutoff pointer where the number of characters on the left is 158 | # equal to the number of characters on the right. 159 | # This is just for visualization 160 | accumulated = 0 161 | for i in xrange(0, x_length): 162 | accumulated += x[i] 163 | if accumulated >= (total / 2): 164 | cutoff_point = i 165 | break 166 | 167 | return pointer_left - 1, pointer_right + 1, cutoff_point, mean, max_pointer 168 | 169 | @classmethod 170 | def _refine_intervals(cls, max_tag_signature, max_pointer, text_data, l_ini, l_end): 171 | """ 172 | This method runs after findIntervals and intends to narrow down 173 | where the left and right pointers are. 174 | """ 175 | 176 | max_tag_signature_parts = max_tag_signature[max_pointer].split('.') 177 | tag_max_match = np.zeros(l_end) 178 | 179 | for i in xrange(l_ini, l_end): 180 | tag_signature_aux_parts = max_tag_signature[i].split('.') 181 | max_match = 0 182 | 183 | for j in range(0, len(max_tag_signature_parts)): 184 | try: 185 | if max_tag_signature_parts[j] == tag_signature_aux_parts[j]: 186 | max_match = j 187 | else: 188 | break 189 | except IndexError, e: 190 | break 191 | 192 | tag_max_match[i] = max_match 193 | 194 | tag_max_match = np.asarray(tag_max_match, dtype='float64') 195 | min_max_scaler = preprocessing.MinMaxScaler() 196 | tag_max_match = min_max_scaler.fit_transform(tag_max_match.reshape(-1, 1)) 197 | 198 | tag_max_match = tag_max_match.reshape(1, -1)[0] 199 | 200 | return l_ini, l_end, tag_max_match 201 | 202 | @classmethod 203 | def _combined_tags_text(cls, html): 204 | 205 | x, text_data, tag_signatures = MainTextExtractor._parse_tags(html) 206 | # print x 207 | if x.any(): 208 | l_ini, l_end, cutoff_point, mean, max_pointer = MainTextExtractor._find_intervals(x) 209 | l_ini, l_end, tag_max_match = MainTextExtractor._refine_intervals(tag_signatures, max_pointer, text_data, l_ini, l_end) 210 | 211 | final_text = [] 212 | for i in xrange(l_ini, l_end): 213 | if tag_max_match[i] > 0.65: 214 | final_text.append(text_data[i]) 215 | 216 | # This is only for debugging - Plot the html distribution 217 | # import matplotlib.pyplot as plt 218 | # mean_line = [mean] * x.shape[0] 219 | # #std = np.std(y_data) 220 | 221 | # plt.figure(1) 222 | # plt.subplot(211) 223 | # plt.plot(x) 224 | # plt.plot(mean_line) 225 | # plt.axvline(x=cutoff_point,linewidth=2, color='purple') 226 | # plt.axvline(x=l_ini,linewidth=2, color='r') 227 | # plt.axvline(x=l_end,linewidth=2, color='r') 228 | # plt.ylabel('Num Characters') 229 | # plt.xlabel('Tag Location') 230 | # plt.title('Content Distribution in HTML Page') 231 | # plt.show() 232 | 233 | return '\n'.join(final_text) 234 | else: 235 | return None 236 | 237 | @classmethod 238 | def _goose_cleaned_text(cls, html, page_html): 239 | article = Article() 240 | article.raw_html = html 241 | article.raw_doc = page_html 242 | article.doc = article.raw_doc 243 | 244 | goose_extractor = ContentExtractor(Configuration(), article) 245 | goose_cleaner = DocumentCleaner(Configuration(), article) 246 | goose_formatter = OutputFormatter(Configuration(), article) 247 | # 
goose_image_extractor = ImageExtractor(Configuration(), article) 248 | 249 | try: 250 | article.doc = goose_cleaner.clean() 251 | article.top_node = goose_extractor.calculate_best_node() 252 | if article.top_node is not None: 253 | article.top_node = goose_extractor.post_cleanup() 254 | article.cleaned_text = goose_formatter.get_formatted_text() 255 | except UnicodeDecodeError, e: 256 | article.top_node = None 257 | 258 | return article.cleaned_text 259 | 260 | @classmethod 261 | def extract(cls, html, page_html): 262 | return [MainTextExtractor._goose_cleaned_text(html, page_html), 263 | MainTextExtractor._combined_tags_text(html), 264 | MainTextExtractor._main_paragraph_text(html)] 265 | 266 | if __name__ == '__main__': 267 | # these packages shouldn't really be here: we only need them for improving the 268 | # extractor, and they might interfere with the rest of the project 269 | mE = MainTextExtractor() 270 | target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 271 | article = Goose().extract(target_url) 272 | print mE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /pyteaser_c.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from math import fabs 3 | from re import split as regex_split, sub as regex_sub 4 | #import nltk 5 | #from nltk import FreqDist 6 | #from nltk.book import * 7 | 8 | 9 | stopWords = [ 10 | "-", " ", ",", ".", "a", "e", "i", "o", "u", "t", "about", "above", 11 | "above", "across", "after", "afterwards", "again", "against", "all", 12 | "almost", "alone", "along", "already", "also", "although", "always", 13 | "am", "among", "amongst", "amoungst", "amount", "an", "and", 14 | "another", "any", "anyhow", "anyone", "anything", "anyway", 15 | "anywhere", "are", "around", "as", "at", "back", "be", "became", 16 | "because", "become", "becomes", "becoming", "been", "before", 17 | "beforehand", "behind", "being", "below", "beside", "besides", 18 | "between", "beyond", "both", "bottom", "but", "by", "call", "can", 19 | "cannot", "can't", "co", "con", "could", "couldn't", "de", 20 | "describe", "detail", "did", "do", "done", "down", "due", "during", 21 | "each", "eg", "eight", "either", "eleven", "else", "elsewhere", 22 | "empty", "enough", "etc", "even", "ever", "every", "everyone", 23 | "everything", "everywhere", "except", "few", "fifteen", "fifty", 24 | "fill", "find", "fire", "first", "five", "for", "former", 25 | "formerly", "forty", "found", "four", "from", "front", "full", 26 | "further", "get", "give", "go", "got", "had", "has", "hasnt", 27 | "have", "he", "hence", "her", "here", "hereafter", "hereby", 28 | "herein", "hereupon", "hers", "herself", "him", "himself", "his", 29 | "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", 30 | "into", "is", "it", "its", "it's", "itself", "just", "keep", "last", 31 | "latter", "latterly", "least", "less", "like", "ltd", "made", "make", 32 | "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", 33 | "moreover", "most", "mostly", "move", "much", "must", "my", "myself", 34 | "name", "namely", "neither", "never", "nevertheless", "new", "next", 35 | "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", 36 | "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", 37 | "onto", "or", "other", "others", "otherwise", "our", "ours", 38 | "ourselves", "out",
"over", "own", "part", "people", "per", 39 | "perhaps", "please", "put", "rather", "re", "said", "same", "see", 40 | "seem", "seemed", "seeming", "seems", "several", "she", "should", 41 | "show", "side", "since", "sincere", "six", "sixty", "so", "some", 42 | "somehow", "someone", "something", "sometime", "sometimes", 43 | "somewhere", "still", "such", "take", "ten", "than", "that", "the", 44 | "their", "them", "themselves", "then", "thence", "there", 45 | "thereafter", "thereby", "therefore", "therein", "thereupon", 46 | "these", "they", "thickv", "thin", "third", "this", "those", 47 | "though", "three", "through", "throughout", "thru", "thus", "to", 48 | "together", "too", "top", "toward", "towards", "twelve", "twenty", 49 | "two", "un", "under", "until", "up", "upon", "us", "use", "very", 50 | "via", "want", "was", "we", "well", "were", "what", "whatever", 51 | "when", "whence", "whenever", "where", "whereafter", "whereas", 52 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", 53 | "while", "whither", "who", "whoever", "whole", "whom", "whose", 54 | "why", "will", "with", "within", "without", "would", "yet", "you", 55 | "your", "yours", "yourself", "yourselves", "the", "reuters", "news", 56 | "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", 57 | "sunday", "mon", "tue", "wed", "thu", "fri", "sat", "sun", 58 | "rappler", "rapplercom", "inquirer", "yahoo", "home", "sports", 59 | "1", "10", "2012", "sa", "says", "tweet", "pm", "home", "homepage", 60 | "sports", "section", "newsinfo", "stories", "story", "photo", 61 | "2013", "na", "ng", "ang", "year", "years", "percent", "ko", "ako", 62 | "yung", "yun", "2", "3", "4", "5", "6", "7", "8", "9", "0", "time", 63 | "january", "february", "march", "april", "may", "june", "july", 64 | "august", "september", "october", "november", "december", 65 | "philippine", "government", "police", "manila" 66 | ] 67 | ideal = 20.0 68 | 69 | 70 | def SummarizeUrl(url): 71 | summaries = [] 72 | try: 73 | article = grab_link(url) 74 | except IOError: 75 | print 'IOError' 76 | return None 77 | 78 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 79 | if not article or not article.cleaned_text or not article.title: 80 | return None 81 | 82 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 83 | title = str(article.title.encode('utf-8', 'ignore')) 84 | print article.author 85 | summaries = Summarize(title, text) 86 | return summaries 87 | 88 | def SummarizePage(html): 89 | summaries = [] 90 | try: 91 | article = grab_page(html) 92 | except IOError: 93 | print 'IOError' 94 | return None 95 | 96 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 97 | if not article or not article.cleaned_text or not article.title: 98 | return None 99 | 100 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 101 | title = str(article.title.encode('utf-8', 'ignore')) 102 | summaries = Summarize(title, text) 103 | return summaries 104 | 105 | def GetArticle(html): 106 | try: 107 | article = grab_page(html) 108 | except IOError: 109 | print 'IOError' 110 | return None 111 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 112 | if not article or not article.cleaned_text or not article.title: 113 | return None 114 | 115 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 116 | title = str(article.title.encode('utf-8', 'ignore')) 117 | return article 118 | 119 | 120 | def Summarize(title, text): 121 | summaries = [] 122 | sentences = split_sentences(text) 123 | 
#print sentences 124 | keys = keywords(text) 125 | titleWords = split_words(title) 126 | 127 | if len(sentences) <= 5: 128 | return sentences 129 | 130 | #score sentences, and use the top 5 sentences 131 | ranks = score(sentences, titleWords, keys).most_common(5) 132 | for rank in ranks: 133 | summaries.append(rank[0]) 134 | 135 | return summaries 136 | 137 | 138 | def grab_link(inurl): 139 | #extract article information using Python Goose 140 | from goose import Goose 141 | try: 142 | article = Goose().extract(url=inurl) 143 | return article 144 | except ValueError: 145 | print 'Goose error grab' 146 | return None 147 | return None 148 | 149 | def grab_page(html): 150 | #extract article information using Python Goose 151 | from goose import Goose 152 | try: 153 | article = Goose().extract_page(raw_html = html) 154 | return article 155 | except ValueError: 156 | print 'Goose error grab' 157 | return None 158 | return None 159 | 160 | 161 | def score(sentences, titleWords, keywords): 162 | #score sentences based on different features 163 | 164 | senSize = len(sentences) 165 | ranks = Counter() 166 | for i, s in enumerate(sentences): 167 | sentence = split_words(s) 168 | titleFeature = title_score(titleWords, sentence) 169 | sentenceLength = length_score(sentence) 170 | sentencePosition = sentence_position(i+1, senSize) 171 | sbsFeature = sbs(sentence, keywords) 172 | dbsFeature = dbs(sentence, keywords) 173 | frequency = (sbsFeature + dbsFeature) / 2.0 * 10.0 174 | 175 | #weighted average of scores from four categories 176 | totalScore = (titleFeature*1.5 + frequency*2.0 + 177 | sentenceLength*1.0 + sentencePosition*1.0) / 4.0 178 | ranks[s] = totalScore 179 | return ranks 180 | 181 | 182 | def sbs(words, keywords): 183 | score = 0.0 184 | if len(words) == 0: 185 | return 0 186 | for word in words: 187 | if word in keywords: 188 | score += keywords[word] 189 | return (1.0 / fabs(len(words)) * score)/10.0 190 | 191 | 192 | def dbs(words, keywords): 193 | if (len(words) == 0): 194 | return 0 195 | 196 | summ = 0 197 | first = [] 198 | second = [] 199 | 200 | for i, word in enumerate(words): 201 | if word in keywords: 202 | score = keywords[word] 203 | if first == []: 204 | first = [i, score] 205 | else: 206 | second = first 207 | first = [i, score] 208 | dif = first[0] - second[0] 209 | summ += (first[1]*second[1]) / (dif ** 2) 210 | 211 | # number of intersections 212 | k = len(set(keywords.keys()).intersection(set(words))) + 1 213 | return (1/(k*(k+1.0))*summ) 214 | 215 | 216 | def split_words(text): 217 | #split a string into an array of words 218 | try: 219 | text = regex_sub(r'[^\w ]', '', text) # strip special chars 220 | return [x.strip('.').lower() for x in text.split()] 221 | except TypeError: 222 | return None 223 | 224 | 225 | def keywords(text): 226 | #sentences = nltk.sent_tokenize(text) 227 | #sentences = [nltk.word_tokenize(sent) for sent in sentences] 228 | #sentences = [nltk.pos_tag(sent) for sent in sentences] 229 | #print sentences 230 | 231 | #fdist1 = FreqDist(text) 232 | #print fdist1.most_common(50) 233 | #for i in sentences: 234 | # print i + "- \n -" 235 | # pass 236 | #print "--" 237 | 238 | #sentences = [nltk.word_tokenize(sent) for sent in sentences] [2] 239 | #sentences = [nltk.pos_tag(sent) for sent in sentences] 240 | 241 | """get the top 10 keywords and their frequency scores; 242 | ignores blacklisted words in stopWords, 243 | counts the number of occurrences of each word, 244 | and sorts them in reverse natural order (so descending) 245 | by number of occurrences
246 | """ 247 | from operator import itemgetter # for sorting 248 | text = split_words(text) 249 | numWords = len(text) # of words before removing blacklist words 250 | text = [x for x in text if x not in stopWords] 251 | freq = Counter() 252 | for word in text: 253 | freq[word] += 1 254 | 255 | minSize = min(10, len(freq)) 256 | keywords = tuple(freq.most_common(minSize)) # get first 10 257 | keywords = dict((x, y) for x, y in keywords) # recreate a dict 258 | 259 | for k in keywords: 260 | articleScore = keywords[k]*1.0 / numWords 261 | keywords[k] = articleScore * 1.5 + 1 262 | 263 | keywords = sorted(keywords.iteritems(), key=itemgetter(1)) 264 | keywords.reverse() 265 | #print(keywords) 266 | return dict(keywords) 267 | 268 | 269 | def split_sentences(text): 270 | ''' 271 | The regular expression matches all sentence ending punctuation and splits the string at those points. 272 | At this point in the code, the list looks like this ["Hello, world", "!" ... ]. The punctuation and all quotation marks 273 | are separated from the actual text. The first s_iter line turns each group of two items in the list into a tuple, 274 | excluding the last item in the list (the last item in the list does not need to have this performed on it). Then, 275 | the second s_iter line combines each tuple in the list into a single item and removes any whitespace at the beginning 276 | of the line. Now, the s_iter list is formatted correctly but it is missing the last item of the sentences list. The 277 | second to last line adds this item to the s_iter list and the last line returns the full list. 278 | ''' 279 | text = text.decode('utf-8') 280 | sentences = regex_split('(? 0 and normalized <= 0.1: 307 | return 0.17 308 | elif normalized > 0.1 and normalized <= 0.2: 309 | return 0.23 310 | elif normalized > 0.2 and normalized <= 0.3: 311 | return 0.14 312 | elif normalized > 0.3 and normalized <= 0.4: 313 | return 0.08 314 | elif normalized > 0.4 and normalized <= 0.5: 315 | return 0.05 316 | elif normalized > 0.5 and normalized <= 0.6: 317 | return 0.04 318 | elif normalized > 0.6 and normalized <= 0.7: 319 | return 0.06 320 | elif normalized > 0.7 and normalized <= 0.8: 321 | return 0.04 322 | elif normalized > 0.8 and normalized <= 0.9: 323 | return 0.04 324 | elif normalized > 0.9 and normalized <= 1.0: 325 | return 0.15 326 | else: 327 | return 0 328 | --------------------------------------------------------------------------------