├── __init__.py ├── features ├── __init__.py ├── table.py ├── files.py ├── sentiment.py ├── feeds.py ├── author.py ├── title.py ├── url2text.py ├── category.py ├── pypdf_to_image.py ├── file2text.py ├── images.py ├── entities.py ├── keywords.py └── main_text.py ├── requirements.txt ├── .gitignore ├── README.md ├── link.py └── pyteaser_c.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | beautifulsoup4==4.4.1 3 | cssselect==0.9.1 4 | goose-extractor==1.0.25 5 | jieba==0.38 6 | langid==1.1.5 7 | lxml==3.6.0 8 | nltk==3.2 9 | numexpr==2.5 10 | numpy==1.10.4 11 | Pillow==3.1.1 12 | pyfscache==0.9.12 13 | requests==2.9.1 14 | scikit-learn==0.17.1 15 | scipy==0.17.0 16 | tables==3.2.2 17 | textblob==0.11.1 18 | treelib==1.3.2 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | #Other 65 | /data 66 | /cache 67 | /features/cache 68 | restats 69 | -------------------------------------------------------------------------------- /features/table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from goose import Goose 4 | 5 | 6 | class TableExtractor(object): 7 | 8 | @classmethod 9 | def _get_tables(cls, html): 10 | """ 11 | Method to extract tables from html 12 | """ 13 | 14 | soup = BeautifulSoup(html, 'html.parser') 15 | return [t for t in soup.find_all('table')] 16 | 17 | @classmethod 18 | def extract(cls, html, page_html): 19 | 20 | soup = BeautifulSoup(html, 'html.parser') 21 | tables = [] 22 | excluded_tags = [ 23 | 'script', 'style', 'noscript', 'head', 'meta', 24 | 'header', 'footer', 'link', 'input', 'nav' 25 | ] 26 | 27 | [x.extract() for et in excluded_tags for x in soup.find_all(et) if x] 28 | 29 | for t in soup.find_all('table'): 30 | tables.append(t) 31 | 32 | return tables 33 | 34 | if __name__ == '__main__': 35 | # this packages should be here but we only need the for improving the 36 | # extractor therefore it might interfere with the rest of the project 37 | tE = TableExtractor() 38 | target_url = 'http://www.artisansofdevizes.com/product-collections/standard-tiles-flagstones/waldorf-limestone-collection-papyrus/' 39 | article = Goose().extract(target_url) 40 | print tE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /features/files.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from urlparse import urlparse 4 | import re 5 | 6 | 7 | class FilesExtractor(object): 8 | 9 | @classmethod 10 | def extract(cls, base_url, html): 11 | 12 | soup = BeautifulSoup(html, 'html.parser') 13 | pattern = re.compile('.*\.pdf|.*\.xls') 14 | base_url_parsed_netloc = urlparse(base_url).netloc.replace('www.', '') 15 | file_urls, context_text = [], [] 16 | 17 | for a in soup.find_all('a'): 18 | try: 19 | a_netloc = urlparse(a['href']).netloc.replace('www.', '') 20 | if a_netloc == base_url_parsed_netloc: 21 | 22 | a_title = a['title'] 23 | a_text = a.getText() 24 | matches = re.finditer(pattern, a['href']) 25 | 26 | for m in matches: 27 | 28 | # Seek context text 29 | txt = [] 30 | if a_title: 31 | txt.append(a_title) 32 | if a_text: 33 | txt.append(a_text) 34 | 35 | file_urls.append(a['href']) 36 | context_text.append(" ".join(txt)) 37 | 38 | except KeyError, e: 39 | continue 40 | 41 | return file_urls, context_text 42 | 43 | if __name__ == '__main__': 44 | from goose import Goose 45 | fE = FilesExtractor() 46 | # target_url = 'https://traditionalbrickandstone.co.uk/product/victoria-falls/' 47 | target_url = 'http://www.imperialhandmadebricks.co.uk/products/yellow-stock/' 48 | article = Goose().extract(target_url) 49 | print fE.extract(target_url, article.raw_html) 50 | 
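A hedged usage sketch for `FilesExtractor` that skips Goose: the extractor only needs the page URL and the raw HTML string, so plain `requests` (my substitution, not what the `__main__` block above uses) is enough to drive it; the target URL is the one from that block.

```python
# Sketch only: assumes the requests dependency from requirements.txt and network access.
import requests
from features.files import FilesExtractor

target_url = 'http://www.imperialhandmadebricks.co.uk/products/yellow-stock/'
html = requests.get(target_url).text
file_urls, context_text = FilesExtractor.extract(target_url, html)
for file_url, context in zip(file_urls, context_text):
    print file_url, '->', context
```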
-------------------------------------------------------------------------------- /features/sentiment.py: -------------------------------------------------------------------------------- 1 | from textblob import TextBlob 2 | 3 | 4 | def findSentiment(keywords): 5 | 6 | k_aux = {} 7 | for k in keywords: 8 | blob = TextBlob(k) 9 | k_aux[k] = {} 10 | 11 | if blob.sentiment.polarity < 0: 12 | k_aux[k]['word'] = 'negative' 13 | elif blob.sentiment.polarity > 0: 14 | k_aux[k]['word'] = 'positive' 15 | else: 16 | k_aux[k]['word'] = 'neutral' 17 | 18 | k_aux[k]['sentiment'] = blob.sentiment.polarity 19 | k_aux[k]['subjectivity'] = blob.sentiment.subjectivity 20 | keywords = k_aux 21 | 22 | return keywords 23 | 24 | def getSentimentText(text): 25 | item_aux = {} 26 | blob = TextBlob(text) 27 | 28 | if blob.sentiment.polarity < 0: 29 | item_aux['word'] = 'negative' 30 | elif blob.sentiment.polarity > 0: 31 | item_aux['word'] = 'positive' 32 | else: 33 | item_aux['word'] = 'neutral' 34 | 35 | item_aux['sentiment'] = blob.sentiment.polarity 36 | item_aux['subjectivity'] = blob.sentiment.subjectivity 37 | 38 | return item_aux 39 | 40 | if __name__ == '__main__': 41 | text = ''' 42 | The titular threat of The Blob has always struck me as the ultimate movie 43 | monster: an insatiably hungry, amoeba-like mass able to penetrate 44 | virtually any safeguard, capable of--as a doomed doctor chillingly 45 | describes it--"assimilating flesh on contact. 46 | Snide comparisons to gelatin be damned, it's a concept with the most 47 | devastating of potential consequences, not unlike the grey goo scenario 48 | proposed by technological theorists fearful of 49 | artificial intelligence run rampant. 50 | ''' 51 | print getSentimentText(text) -------------------------------------------------------------------------------- /features/feeds.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from urlparse import urlparse 5 | 6 | 7 | class FeedsExtractor(object): 8 | 9 | @classmethod 10 | def extract(self, url): 11 | rss = [] 12 | headers = {'Accept': ':text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip,deflate,sdch', 14 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36'} 15 | 16 | r = requests.get(url, headers=headers) 17 | soup = BeautifulSoup(r.text, "html.parser") 18 | 19 | for a in soup.find_all(['a', 'link']): 20 | href = a.get('href') 21 | # find urls that contain .rss 22 | if href: 23 | if 'rss' in href: 24 | rss.append(href) 25 | elif '/feed' in href: 26 | rss.append(href) 27 | 28 | if not rss: 29 | feep_paths = ['feed', 'rss'] 30 | o = urlparse(url) 31 | clean_url = o.scheme + "://" + o.netloc 32 | 33 | for f in feep_paths: 34 | try_url = clean_url + "/" + f 35 | r = requests.get(try_url, headers=headers) 36 | if r.status_code == 200: 37 | rss.append(try_url) 38 | 39 | return self._clean_rss(rss, url) 40 | 41 | @classmethod 42 | def _clean_rss(self, rss, base_url): 43 | o_base = urlparse(base_url) 44 | for i, item in enumerate(rss): 45 | o = urlparse(item) 46 | scheme = o.scheme 47 | netloc = o.netloc 48 | if not o.scheme: 49 | scheme = o_base.scheme 50 | if not o.netloc: 51 | netloc = o_base.netloc 52 | 53 | rss[i] = scheme + "://" + netloc + o.path 54 | 55 | return list(set(rss)) 56 | 57 | if __name__ == '__main__': 58 | fE = FeedsExtractor() 59 | print 
fE.extract('http://techcrunch.com') 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature Engineering 2 | The repository contains modules to extract features from text and web pages. The features can be used as training data for machine learning algorithms or to improve your applications. Some of the methods are geared towards news articles, but they also work with other domains. If you are not a Python programmer or need to do feature engineering at a larger scale, you can use [the API](https://market.mashape.com/adlegant/article-analysis). 3 | 4 | ## Installation 5 | ``` 6 | git clone git@github.com:webeng/feature_engineering.git 7 | cd feature_engineering 8 | virtualenv env 9 | source env/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Run `python link.py` to see an example. 14 | 15 | If you want fast keyword extraction, you will have to [install HDF5](http://www.hdfgroup.org/ftp/HDF5/current/src/unpacked/release_docs/INSTALL). You might also have to install PyTables by running `sudo HDF5_DIR=/usr/local/hdf5/ pip install tables`, and add /usr/local/hdf5/lib/ to LD_LIBRARY_PATH. I'll try to develop a slower version that does not need HDF5. 16 | 17 | # Modules 18 | You can run each module individually to see examples. 19 | 20 | ## author.py 21 | Extracts the author of an article given a link. 22 | 23 | ## category.py 24 | Classifies a document. 25 | 26 | ## entities.py 27 | Named entity recognition. 28 | 29 | ## feeds.py 30 | Extracts feed URLs given a link. 31 | 32 | ## images.py (to be added) 33 | Extracts images from an HTML document and ranks them by surface area. 34 | 35 | ## main_text.py (to be added) 36 | Extracts the main text of a page given a URL. 37 | 38 | ## keywords.py 39 | Extracts the main keywords in a text document using term frequency-inverse document frequency (TF-IDF). 40 | 41 | ## sentiment.py 42 | Analyses the sentiment of a text or keyword. 43 | 44 | ## title.py 45 | Extracts page titles.
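## Usage example
A minimal sketch of calling the link extractor from Python rather than via `python link.py`. It assumes the dependencies above are installed and that the pickled models expected under `./data/` are in place:

```python
from link import Link

l = Link()
result = l.extract('https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/')
print result['title']
print result['tags']
print result['summary']
```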
46 | -------------------------------------------------------------------------------- /features/author.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from urlparse import urlparse 4 | 5 | 6 | class AuthorExtractor(object): 7 | 8 | @classmethod 9 | def clean_author(self, href, text=None): 10 | author = None 11 | if text: 12 | return text 13 | else: 14 | for part in href.split('/')[::-1]: 15 | if part not in ['', '/']: 16 | author = part.capitalize().replace('-', ' ') 17 | break 18 | 19 | return author 20 | 21 | @classmethod 22 | def extract(self, base_url, html): 23 | authors = [] 24 | soup = BeautifulSoup(html, 'html.parser') 25 | 26 | for a in soup.findAll('a'): 27 | href = a.get('href') 28 | 29 | if href: 30 | if re.search('.author.?/.', href) is not None: 31 | authors.append(self.clean_author(href, a.get_text())) 32 | elif re.search('.people/.', href) is not None: 33 | authors.append(self.clean_author(href, a.get_text())) 34 | elif (re.search('.user.?/.', href) is not None) & (re.search('.youtube.com.', href) is None): 35 | authors.append(self.clean_author(href, a.get_text())) 36 | elif re.search('.editor.?/.', href) is not None: 37 | authors.append(self.clean_author(href, a.get_text())) 38 | elif re.search('.contributor.?/.', href) is not None: 39 | authors.append(self.clean_author(href, a.get_text())) 40 | 41 | if not authors: 42 | author = None 43 | url_parse = urlparse(base_url) 44 | domain = url_parse.netloc.split('.') 45 | if len(domain) >= 2: 46 | author = domain[1] if domain[0] == 'www' else domain[0] 47 | 48 | authors.append(author.capitalize() + ' Staff') 49 | 50 | return authors 51 | 52 | if __name__ == '__main__': 53 | from goose import Goose 54 | aE = AuthorExtractor() 55 | target_url = 'http://www.wired.com/2016/03/1000-days-1000-surreal-posters-one-unfortunate-design/' 56 | article = Goose().extract(target_url) 57 | print aE.extract(target_url, article.raw_html) 58 | -------------------------------------------------------------------------------- /features/title.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from bs4 import BeautifulSoup 4 | from goose import Goose 5 | from goose.article import Article 6 | from goose.extractors.title import TitleExtractor as TitleExtractorGoose 7 | from goose.configuration import Configuration 8 | 9 | 10 | class TitleExtractor(object): 11 | 12 | SPLIT_CHARS = ['|', '–', '-'] 13 | 14 | def __init__(self): 15 | pass 16 | 17 | @classmethod 18 | def extract_text(cls, tag): 19 | if tag.string: 20 | return tag.string.strip().encode('utf-8', 'replace') 21 | return None 22 | 23 | @classmethod 24 | def _remove_duplicates_keep_order(cls, seq): 25 | seen = set() 26 | seen_add = seen.add 27 | return [x for x in seq if not (x in seen or seen_add(x))] 28 | 29 | @classmethod 30 | def extract(cls, html, html_formated): 31 | 32 | potential_titles = [] 33 | soup = BeautifulSoup(html, 'html.parser') 34 | 35 | if soup.title: 36 | page_title = TitleExtractor.extract_text(soup.title) 37 | 38 | for split_char in TitleExtractor.SPLIT_CHARS: 39 | if split_char in page_title: 40 | page_title = page_title.split(split_char)[0].strip() 41 | 42 | potential_titles.append(page_title) 43 | 44 | for heading_tag in (soup.find_all('h1') + soup.find_all('h2')): 45 | potential_title = TitleExtractor.extract_text(heading_tag) 46 | if potential_title: 47 | potential_titles.append(potential_title) 48 | 49 | # Extract 
article from goose 50 | article = Article() 51 | article.raw_html = html 52 | article.raw_doc = html_formated 53 | article.doc = article.raw_doc 54 | try: 55 | goose_title = TitleExtractorGoose(Configuration(), article).get_title() 56 | except AttributeError, e: 57 | goose_title = None 58 | 59 | return cls._remove_duplicates_keep_order(list(potential_titles + [goose_title])) 60 | # return list(set(potential_titles + [goose_title])) it doesn't preserve the order 61 | 62 | if __name__ == '__main__': 63 | 64 | tE = TitleExtractor() 65 | target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 66 | article = Goose().extract(target_url) 67 | print tE.extract(article.raw_html, article.raw_doc) 68 | -------------------------------------------------------------------------------- /features/url2text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import hashlib 3 | import requests 4 | from features.main_text import MainTextExtractor 5 | from lxml import html 6 | import pprint 7 | import re 8 | from features.file2text import File2Text 9 | 10 | 11 | class Url2Text(object): 12 | 13 | @classmethod 14 | def extract(cls, url, content_type=None): 15 | texts = [] 16 | file2text = File2Text() 17 | 18 | pdf_pattern = re.compile('.*application\/pdf.*|.*application\/octet-stream.*') 19 | html_pattern = re.compile('.*text\/html.*') 20 | 21 | try: 22 | r = requests.get(url, timeout=30) 23 | except requests.exceptions.SSLError, e: 24 | r = requests.get(url, verify=False) 25 | 26 | if not content_type: 27 | content_type = r.headers['Content-Type'] 28 | 29 | print content_type 30 | 31 | matches_html = len(re.findall(html_pattern, content_type)) 32 | matches_pdf = len(re.findall(pdf_pattern, content_type)) 33 | 34 | if r.status_code == 200: 35 | if matches_html == 0: 36 | 37 | file_prefix = hashlib.md5(url).hexdigest() 38 | 39 | dst_path = './tmp/' 40 | 41 | dst = dst_path + file_prefix + '_' + url.split('/')[-1] 42 | 43 | with open(dst, 'wb') as f: 44 | for chunk in r.iter_content(1024): 45 | f.write(chunk) 46 | # call PDF2Text 47 | texts = [file2text.extract_all(dst)] 48 | else: 49 | texts = filter(None, MainTextExtractor.extract(r.text, html.fromstring(r.text))) 50 | 51 | return texts 52 | 53 | if __name__ == '__main__': 54 | # this packages should be here but we only need the for improving the 55 | # extractor therefore it might interfere with the rest of the project 56 | url2text = Url2Text() 57 | # PDF 58 | # target_url = "https://ocs.fas.harvard.edu/files/ocs/files/undergrad_resumes_and_cover_letters.pdf" 59 | target_url = 'http://www.artisansofdevizes.com/product-collections/standard-tiles-flagstones/waldorf-limestone-collection-papyrus/' 60 | # Image 61 | target_url = 'https://onepagelove-wpengine.netdna-ssl.com/wp-content/uploads/2016/10/opl-small-1.jpg' 62 | # Mp3 if it returns and error - Run brew install sox or sudo apt-get install sox 63 | target_url = 'http://www.noiseaddicts.com/samples_1w72b820/47.mp3' 64 | # article = Goose().extract(target_url) 65 | texts = url2text.extract(target_url) 66 | print texts 67 | # print tE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /features/category.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | 
from sklearn.feature_extraction.text import TfidfTransformer 4 | import cPickle 5 | import pyfscache 6 | 7 | cache_it = pyfscache.FSCache('./cache', days=10, hours=12, minutes=30) 8 | 9 | 10 | class Classifier(object): 11 | 12 | def __init__(self, data_path='../data/'): 13 | self.data_path = data_path 14 | 15 | @cache_it 16 | def getModels(self): 17 | with open(self.data_path + '/categories.pkl', 'rb') as f: 18 | categories = cPickle.load(f) 19 | 20 | with open(self.data_path + '/category_map.pkl', 'rb') as f: 21 | category_map = cPickle.load(f) 22 | 23 | with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f: 24 | clf = cPickle.load(f) 25 | 26 | count_vect = CountVectorizer() 27 | with open(self.data_path + '/count_vect.pkl', 'rb') as f: 28 | count_vect = cPickle.load(f) 29 | 30 | tfidf_transformer = TfidfTransformer() 31 | with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f: 32 | tfidf_transformer = cPickle.load(f) 33 | 34 | with open(self.data_path + '/tree.pkl', 'rb') as f: 35 | tree = cPickle.load(f) 36 | 37 | return categories, category_map, clf, count_vect, tfidf_transformer, tree 38 | 39 | def predict(self, text): 40 | categories, category_map = [], [] 41 | categories, category_map, clf, count_vect, tfidf_transformer, tree = self.getModels() 42 | 43 | # tree.show() 44 | X_new_counts = count_vect.transform([text]) 45 | X_new_tfidf = tfidf_transformer.transform(X_new_counts) 46 | 47 | predicted = clf.predict(X_new_tfidf) 48 | 49 | predictions = [] 50 | for doc, cats in zip([text], predicted): 51 | if isinstance(cats, list): 52 | predictions += [categories[cat] for cat in cats] 53 | else: 54 | predictions.append(tree.get_node(cats).tag) 55 | 56 | return predictions 57 | 58 | if __name__ == '__main__': 59 | import cProfile 60 | import pstats 61 | clf = Classifier() 62 | text = 'Six Nations 2016: Wales 67-14 Italy Wales will finish second in the Six Nations after a record-breaking win over Italy. Warren Gatland team scored nine tries on their way to their biggest points total in a Championship game in Cardiff. Scrum-half Rhys Webb started the rout with the opening try within five minutes, and wing George North scored his fourth try in successive games. Dan Biggar also scored a try in a personal tally of 20 points. Replacement Ross Moriarty crossed twice as Wales won by a record margin of 53 points against the Italians - beating the 41-point mark set last year in Rome. Italy were completely outclassed, but crossed twice in the second half through scrum-half Guiglielmo Palazzini and centre Gonzalo Garcia. But for lacklustre first-half displays in the 16-16 draw with Ireland and the 25-21 loss to England, Wales could have been championship contenders. As it is, they will watch England - already crowned champions - go for a Grand Slam in Paris.' 63 | print clf.predict(text) 64 | 65 | cProfile.run("clf.predict(text)", 'restats') 66 | p = pstats.Stats('restats') 67 | p.sort_stats('cumulative').print_stats(30) 68 | -------------------------------------------------------------------------------- /features/pypdf_to_image.py: -------------------------------------------------------------------------------- 1 | """ 2 | Problem: 3 | How to Convert PDF to Image with Python Script ? 
4 | 5 | Installation: 6 | I use Ubuntu 14.04. 7 | We use Wand, a wrapper for ImageMagick [http://www.imagemagick.org/script/index.php], to convert the PDF file in Python. 8 | To install it: 9 | 10 | $ sudo apt-get install libmagickwand-dev 11 | $ pip install Wand 12 | 13 | Now install Pillow (the PIL fork): 14 | $ pip install Pillow 15 | 16 | More installation notes: http://sorry-wand.readthedocs.org/en/latest/guide/install.html 17 | More about Wand: https://pypi.python.org/pypi/Wand 18 | """ 19 | 20 | from PIL import Image as Img 21 | from wand.image import Image 22 | import uuid 23 | import numpy as np 24 | import glob 25 | import os 26 | import sys 27 | 28 | def convert(filepdf): 29 | # Generate a temp file name so we do not duplicate or replace anything. 30 | uuid_set = str(uuid.uuid4().fields[-1])[:5] 31 | try: 32 | # Now let's convert the PDF to images. 33 | # A resolution of 200 dpi is good as far as I know. 34 | with Image(filename=filepdf, resolution=200) as img: 35 | # Keep good quality. 36 | img.compression_quality = 80 37 | # Save it under the temp name. 38 | img.save(filename="./data/temp%s.jpg" % uuid_set) 39 | except Exception, err: 40 | # Always keep track of the error until the code has been cleaned up. 41 | #print err 42 | print err 43 | return False 44 | else: 45 | """ 46 | The PDF has been converted successfully, 47 | but each page is saved as a separate image, 48 | so now we need to merge all the files. 49 | """ 50 | pathsave = [] 51 | try: 52 | # Search for all images in the temp path whose names contain the uuid_set value. 53 | list_im = glob.glob("./data/temp%s*.jpg" % uuid_set) 54 | list_im.sort() # Sort the files before joining them. 55 | imgs = [Img.open(i) for i in list_im] 56 | # Now let's combine the page images vertically. 57 | min_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[0][1] 58 | imgs_comb = np.vstack( 59 | (np.asarray(i.resize(min_shape)) for i in imgs)) 60 | # For horizontal stacking, change the vstack to hstack. 61 | imgs_comb = Img.fromarray(imgs_comb) 62 | pathsave = "./data/my_pdf%s.jpg" % uuid_set 63 | # Now save the combined image. 64 | imgs_comb.save(pathsave) 65 | # ...and then remove all the temp images. 66 | for i in list_im: 67 | os.remove(i) 68 | except Exception, err: 69 | #print err 70 | return False 71 | return pathsave 72 | 73 | if __name__ == "__main__": 74 | arg = sys.argv[1] 75 | result = convert(arg) 76 | if result: 77 | print "[*] Successfully converted %s and saved it to %s" % (arg, result) 78 | else: 79 | print "[!] Whoops, something went wrong. Enable the err variable to track it." 80 | 81 | """ 82 | =========================================== 83 | Running Test: 84 | python testing-pdf.py zz.pdf 85 | [*] Successfully converted zz.pdf and saved it to Resume63245.jpg 86 | 87 | =========================================== 88 | """ 89 | # Well, I hope this will be useful for you & others.
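A hedged sketch of calling `convert()` from another module instead of the CLI entry point above, mirroring how `features/file2text.py` imports it. The `sample.pdf` path is a placeholder, and the `./data/` directory must exist for the temporary page images:

```python
from features.pypdf_to_image import convert

jpg_path = convert('sample.pdf')  # placeholder path, not a file shipped with this repo
if jpg_path:
    print "[*] Combined page image written to %s" % jpg_path
else:
    print "[!] Conversion failed; see the err output inside convert()"
```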
-------------------------------------------------------------------------------- /features/file2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 5 | from pdfminer.converter import TextConverter 6 | from pdfminer.layout import LAParams 7 | from pdfminer.pdfpage import PDFPage 8 | from cStringIO import StringIO 9 | import textract 10 | from features.pypdf_to_image import convert as convert_pdf_to_image 11 | from time import time 12 | import os 13 | 14 | 15 | class File2Text(object): 16 | """docstring for File2Text""" 17 | def __init__(self): 18 | super(File2Text, self).__init__() 19 | 20 | def extract(self, src, maxpages=0): 21 | rsrcmgr = PDFResourceManager() 22 | retstr = StringIO() 23 | codec = 'utf-8' 24 | laparams = LAParams(line_overlap=0.5, 25 | char_margin=2.0, 26 | line_margin=0.5, 27 | word_margin=0.1, 28 | boxes_flow=0.5, 29 | detect_vertical=True, 30 | all_texts=True) 31 | device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) 32 | # device = TextConverter(rsrcmgr, retstr, codec=codec) 33 | fp = file(src, 'rb') 34 | interpreter = PDFPageInterpreter(rsrcmgr, device) 35 | password = "" 36 | caching = True 37 | pagenos = set() 38 | 39 | for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): 40 | interpreter.process_page(page) 41 | 42 | text = retstr.getvalue() 43 | 44 | fp.close() 45 | device.close() 46 | retstr.close() 47 | return text 48 | 49 | def extract_all(self, src, maxpages=0): 50 | if '.pdf' in src: 51 | try: 52 | start = time() 53 | text = self.extract(src, maxpages=maxpages) 54 | print "case 1 elapsed_time {}s".format(time() - start) 55 | except Exception, e: 56 | start = time() 57 | text = textract.process(src) 58 | print "case 2 elapsed_time {}s".format(time() - start) 59 | 60 | else: 61 | # TODO: allow other formats 62 | # return None 63 | start = time() 64 | text = textract.process(src) 65 | print "case 3 elapsed_time {}s".format(time() - start) 66 | 67 | 68 | # if text and len(text.strip()) == 0: 69 | # text = None 70 | 71 | if not text or len(text) < 10: 72 | # TODO: Speed this process up 73 | # return None 74 | print "...attempting convert_pdf_to_image" 75 | start = time() 76 | pdf_path = convert_pdf_to_image(src) 77 | text = textract.process(pdf_path) 78 | os.remove(pdf_path) 79 | print "case 4 elapsed_time {}s".format(time() - start) 80 | 81 | return text 82 | 83 | if __name__ == '__main__': 84 | src = '/Volumes/FLUFFUSHFS/Datasets/product_properties/data/documents/105293334418129088138c7cf90dacf7_hush-acoustics_Hush-Panel-32_Specifications_NR282-12-Hush-Panel-32.pdf' 85 | # src = '/Volumes/FLUFFUSHFS/Datasets/product_properties/data/documents/0a3a09b2ddb9615ba06cb2f7b812a3b4_ruukki-uk_C-purlin_Technical-Files_LP-IN05-EN.pdf' 86 | #print File2Text.extract(src) 87 | pdf2text = File2Text() 88 | # print File2Text.extract(src) 89 | print pdf2text.extract_all(src) 90 | # print convert_pdf_to_txt(src) 91 | -------------------------------------------------------------------------------- /features/images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | from urlparse import urlparse 4 | import urllib 5 | import cStringIO 6 | from PIL import Image 7 | 8 | 9 | class ImagesExtractor(object): 10 | 11 | @classmethod 12 | def 
_normalise_url(cls, base_url, url): 13 | """ 14 | Normalises a relative url to an absolute one if base domain is not 15 | present. 16 | """ 17 | parsed_url = urlparse(url) 18 | 19 | if not parsed_url.netloc: 20 | return base_url + '/'.join([segment 21 | for segment in parsed_url.path.split('/') 22 | if segment not in ['..', '.', '', None]]) + '?' + parsed_url.query 23 | elif not parsed_url.scheme: 24 | return 'http:' + url 25 | 26 | return url 27 | 28 | @classmethod 29 | def _process_image(cls, base_url, img_url): 30 | return ImagesExtractor._normalise_url(base_url, img_url) 31 | 32 | @classmethod 33 | def _get_zoom_image(cls, img): 34 | if img.parent.get('href'): 35 | return img.parent.get('href') 36 | else: # if not href, try data attributes 37 | for key, value in img.parent.attrs.iteritems(): 38 | if key.startswith('data-'): 39 | if value.startswith('http://'): 40 | return value 41 | return None 42 | 43 | @classmethod 44 | def _get_meta_image(cls, meta_tag): 45 | return meta_tag.get('content') 46 | 47 | @classmethod 48 | def select_top_image(cls, images): 49 | max_surface = 0 50 | selected = None 51 | for img_src in images: 52 | # print img_src 53 | try: 54 | file = cStringIO.StringIO(urllib.urlopen(img_src).read()) 55 | except UnicodeDecodeError, e: 56 | print e 57 | continue 58 | except IOError, e: 59 | print e 60 | continue 61 | 62 | try: 63 | im = Image.open(file) 64 | except IOError, e: 65 | continue 66 | width, height = im.size 67 | if width > 100 and height > 100: 68 | surface = (width * height) / 2 69 | if surface > max_surface: 70 | selected = img_src 71 | max_surface = surface 72 | 73 | return selected 74 | 75 | @classmethod 76 | def rank(cls, images): 77 | images_aux = [] 78 | 79 | for img_src in images: 80 | try: 81 | file = cStringIO.StringIO(urllib.urlopen(img_src).read()) 82 | im = Image.open(file) 83 | except UnicodeDecodeError, e: 84 | continue 85 | except IOError, e: 86 | continue 87 | 88 | width, height = im.size 89 | if width > 100 and height > 100: 90 | surface = (width * height) / 2 91 | images_aux.append([img_src, surface]) 92 | 93 | images_aux.sort(key=lambda x: x[1], reverse=True) 94 | 95 | return images_aux 96 | 97 | @classmethod 98 | def extract(cls, base_url, html): 99 | soup = BeautifulSoup(html, 'html.parser') 100 | 101 | img_tag_urls = filter(None, [img.get('src') for img in soup.find_all('img')]) 102 | zoom_img_urls = [] # zoom_img_urls = filter(None, [ImagesExtractor._get_zoom_image(img) for img in soup.select('a > img')]) 103 | meta_img_urls = filter(None, [ImagesExtractor._get_meta_image(mtag) for mtag in soup.select('meta[property=og:image]')]) 104 | image_urls = img_tag_urls + zoom_img_urls + meta_img_urls 105 | 106 | return [ImagesExtractor._process_image(base_url, image_url) for image_url in image_urls] 107 | 108 | if __name__ == '__main__': 109 | from goose import Goose 110 | iE = ImagesExtractor() 111 | # target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 112 | target_url = 'https://www.trilux.com/products/en/Indoor-lighting/Continuous-line-luminaires-and-batten-luminaires/E-Line-LED-IP20-54-rapid-mounting-continuous-line/?retainFilter=true' 113 | article = Goose().extract(target_url) 114 | print iE.extract(target_url, article.raw_html) 115 | -------------------------------------------------------------------------------- /link.py: -------------------------------------------------------------------------------- 1 | from features.title import TitleExtractor 2 | 
from features.main_text import MainTextExtractor 3 | from features.images import ImagesExtractor 4 | from features.sentiment import getSentimentText, findSentiment 5 | from features.keywords import KeywordsExtractor 6 | from features.entities import Entities 7 | from features.author import AuthorExtractor 8 | from features.category import Classifier 9 | from features.url2text import Url2Text 10 | from goose import Goose 11 | from lxml import etree 12 | from pyteaser_c import Summarize 13 | from pyteaser_c import SummarizePage 14 | from pyteaser_c import GetArticle 15 | from pyteaser_c import keywords 16 | from textblob import TextBlob 17 | import langid 18 | from bs4 import BeautifulSoup 19 | #import lxml.html 20 | from lxml import html 21 | import requests 22 | import pprint 23 | import re 24 | import os 25 | 26 | 27 | class NoMainTextException(Exception): 28 | pass 29 | 30 | 31 | class Link(object): 32 | # def is_html(self): 33 | # pass 34 | 35 | @classmethod 36 | def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'): 37 | errors, summaries, categories, entities, keywords = [], [], [], [], [] 38 | pdf_pattern = re.compile('.*application\/pdf.*|.*application\/octet-stream.*') 39 | html_pattern = re.compile('.*text\/html.*') 40 | 41 | article = Goose().extract(link) 42 | 43 | content_type = article.__dict__['additional_data']['result'].info()['content-type'] 44 | matches_html = len(re.findall(html_pattern, content_type)) 45 | matches_pdf = len(re.findall(pdf_pattern, content_type)) 46 | 47 | if matches_html == 0: 48 | # Textract 49 | url2text = Url2Text() 50 | texts = url2text.extract(link) 51 | 52 | k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path) 53 | ent = Entities() 54 | clf = Classifier(data_path=data_path) 55 | 56 | return { 57 | "title": os.path.basename(link), 58 | "link": link, 59 | "author": [], 60 | "cleaned_text": texts[0], 61 | "text_sentiment": getSentimentText(texts[0]), 62 | "main_body": None, 63 | "images": None, 64 | "image": None, 65 | "date": article.__dict__['additional_data']['result'].info()['last-modified'], 66 | "tags": k.extract([texts[0]], None, None, 'news')[0], 67 | "entities": ent.extract(texts[0], entity_description), 68 | "language": langid.classify(texts[0])[0], 69 | "summary": Summarize(None, texts[0]), 70 | "categories": clf.predict(texts[0]) 71 | } 72 | pass 73 | else: 74 | 75 | valid_html = bool(BeautifulSoup(article.raw_html[0:100], "html.parser").find()) 76 | 77 | if not valid_html: 78 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'} 79 | r = requests.get(link, headers=headers) 80 | article.raw_html = r.text 81 | article.raw_doc = html.fromstring(r.text) 82 | 83 | if article.raw_doc is None: 84 | raise NoMainTextException 85 | 86 | authors = AuthorExtractor.extract(link, article.raw_html) 87 | publish_date = article.publish_date if article.publish_date else None 88 | 89 | if not article.title: 90 | article.title = TitleExtractor.extract( 91 | article.raw_html, article.raw_doc)[0] 92 | 93 | k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path) 94 | 95 | if article.top_node is not None: 96 | main_body = etree.tostring(article.top_node) 97 | else: 98 | cleant_text_suggestions = MainTextExtractor.extract(article.raw_html, article.raw_doc) 99 | article.cleaned_text = cleant_text_suggestions[1] 100 | if not article.cleaned_text: 101 | article.cleaned_text = 
cleant_text_suggestions[2] 102 | if not article.cleaned_text: 103 | raise NoMainTextException 104 | main_body = 'Sorry, we could not detect the main HTML content for this article' 105 | 106 | try: 107 | summaries = Summarize( 108 | article.title, article.cleaned_text.encode('utf-8', 'ignore')) 109 | except Exception, e: 110 | summaries.append('We could not make summaries at this time.') 111 | 112 | try: 113 | text_sentiment = getSentimentText(article.cleaned_text) 114 | except Exception, e: 115 | text_sentiment = None 116 | text = article.title + " " + article.cleaned_text 117 | keywords = k.extract([text], None, None, 'news')[0] 118 | 119 | # Get keywords from meta tag 120 | if not keywords: 121 | keywords = article.meta_keywords.split(',') 122 | 123 | # Get keywords from Goose 124 | if not keywords: 125 | keywords = [t for t in article.tags] 126 | 127 | if sentiment: 128 | keywords = findSentiment(keywords) 129 | 130 | ent = Entities() 131 | try: 132 | entities = ent.extract(text, entity_description) 133 | except Exception, e: 134 | entities.append('We could not extract entities at this time.') 135 | 136 | if sentiment: 137 | entities = findSentiment(entities) 138 | 139 | language = article.meta_lang 140 | 141 | if not language: 142 | language = langid.classify(article.cleaned_text)[0] 143 | 144 | if language in ['en', 'eo']: 145 | clf = Classifier(data_path=data_path) 146 | article.categories = clf.predict(text) 147 | else: 148 | article.categories = ["Article classification not ready for: " + language[0]] 149 | 150 | images = ImagesExtractor.extract(link, article.raw_html) 151 | 152 | if article.top_image: 153 | thumbnail = article.top_image.src 154 | else: 155 | #thumbnail = images[0] if images else None 156 | thumbnail = ImagesExtractor.select_top_image(images[0:50]) 157 | 158 | return { 159 | "title": article.title, 160 | "link": article.final_url, 161 | "author": authors, 162 | "cleaned_text": article.cleaned_text, 163 | "text_sentiment": text_sentiment, 164 | "main_body": main_body, 165 | "images": images, 166 | "image": thumbnail, 167 | "date": article.publish_date, 168 | "tags": keywords, 169 | "entities": entities, 170 | "language": language, 171 | "summary": summaries, 172 | "categories": article.categories 173 | } 174 | 175 | if __name__ == '__main__': 176 | import pprint 177 | l = Link() 178 | url = 'https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/' 179 | # l = l.extract('http://techcrunch.com/2016/03/18/twitter-says-few-users-have-opted-out-of-its-new-algorithmic-timeline/') 180 | #l = l.extract('https://www.wired.com/2017/05/google-just-made-email-heckuva-lot-easier-deal/') 181 | # l = l.extract('http://www.independent.co.uk/life-style/gadgets-and-tech/features/google-lens-ai-preview-features-so-impressive-its-scary-a7745686.html') 182 | # l = l.extract('https://onepagelove-wpengine.netdna-ssl.com/wp-content/uploads/2016/10/opl-small-1.jpg') 183 | target_url = 'http://www.noiseaddicts.com/samples_1w72b820/47.mp3' 184 | l = l.extract(target_url) 185 | 186 | pprint.pprint(l) 187 | # import requests 188 | # r = requests.get(url) 189 | # print r.text 190 | -------------------------------------------------------------------------------- /features/entities.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import requests 3 | 4 | 5 | class Entities(object): 6 | 7 | def remove_return_lines_and_quotes(self, text): 8 | text = text.replace('\n', ' ') 9 | text = text.replace('\t', ' ') 10 | text = 
text.replace('\r', ' ') 11 | text = text.replace('"', '') 12 | return text 13 | 14 | def extract(self, text, entity_description=False): 15 | # We need to clean the text in each method otherwise when we present it 16 | # to the user, it will have a different format 17 | text = self.remove_return_lines_and_quotes(text) 18 | sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)] 19 | 20 | # This function is quite expensive 21 | sentences = [nltk.pos_tag(sent) for sent in sentences] 22 | 23 | entities_all = {} if entity_description else [] 24 | 25 | #stop = stopwords.words('english') 26 | # more_stop_words = ['(' , ')', "'s" , ',', ':' , '<' , '>' , '.' , '-' , '&' ,'*','...' , 'therefore' , '.vs','hence'] 27 | # stop = stopwords.words('english') 28 | # stop = stop + more_stop_words 29 | stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep keeps", 30 | "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", 
"mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"] 31 | 32 | for s in sentences: 33 | chunked = nltk.ne_chunk(s, binary=True) 34 | for n in chunked: 35 | if isinstance(n, nltk.tree.Tree): 36 | if n.label() == 'NE': 37 | entities_all = self.getEntity(n, stop, entities_all, entity_description) 38 | 39 | if entity_description: 40 | return entities_all 41 | else: 42 | return list(set(entities_all)) 43 | 44 | def getEntity(self, n, stop, entities_all, entity_description=None): 45 | entity = None 46 | 47 | for c in n: 48 | entity = c[0] if not entity else entity + " " + c[0] 49 | 50 | entity_lower = entity.lower() 51 | entity_lower = [i for i in [entity_lower] if i not in stop] 52 | 53 | if entity_lower: 54 | if entity_description: 55 | entity_dbpedia = self.lookup_entity(entity) 56 | entities_all[entity_dbpedia['name']] = entity_dbpedia 57 | else: 58 | entities_all.append(entity) 59 | 60 | return entities_all 61 | 62 | def lookup_entity(self, entity): 63 | entity_dbpedia = {} 64 | entity_dbpedia['name'] = entity 65 | entity_dbpedia['categories'] = [] 66 | entity_dbpedia['classes'] = [] 67 | entity_dbpedia['description'] = None 68 | 69 | headers = { 70 | 'content-type': 'application/json', 71 | 'Accept': 'application/json' 72 | } 73 | 74 | r = requests.get('http://lookup.dbpedia.org/api/search/PrefixSearch?MaxHits=2&QueryString=' + entity, headers=headers) 75 | 76 | if r.status_code == 200: 77 | r_json = r.json() 78 | if r_json['results']: 79 | try: 80 | entity_dbpedia['description'] = r_json['results'][0]['description'] 81 | except KeyError, e: 82 | pass 83 | 84 | try: 85 | entity_dbpedia['categories'] = r_json['results'][0]['categories'][0] 86 | except KeyError, e: 87 | pass 88 | 89 | try: 90 | 
entity_dbpedia['classes'] = r_json['results'][0]['classes'] 91 | except KeyError, e: 92 | pass 93 | 94 | return entity_dbpedia 95 | 96 | if __name__ == '__main__': 97 | e = Entities() 98 | text = "Iain Duncan Smith has criticised the government's desperate search for savings in his first interview since resigning as work and pensions secretary." 99 | print e.extract(text, entity_description=True) 100 | -------------------------------------------------------------------------------- /features/keywords.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import nltk 3 | from nltk.stem.porter import PorterStemmer 4 | from nltk.corpus import stopwords 5 | from nltk.collocations import * 6 | import numpy as np 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | import sys 9 | import string 10 | import cPickle 11 | from bs4 import BeautifulSoup 12 | from os import listdir 13 | from os.path import isfile, join, split 14 | import cProfile 15 | import pstats 16 | import tables 17 | import numpy as np 18 | import csv 19 | from pprint import pprint 20 | nltk.data.path = ['home/ubuntu/nltk_data', '/Users/joanfihu/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data' ,'/home/ubuntu/nltk_data'] 21 | 22 | 23 | class KeywordsExtractor(object): 24 | 25 | num_kewyords = 0 26 | data_path = './data/' 27 | stemmer = PorterStemmer() 28 | verbose = None 29 | 30 | def __init__(self, num_kewyords=10, data_path='../data/', verbose=False): 31 | self.num_kewyords = num_kewyords 32 | self.data_path = data_path 33 | self.verbose = verbose 34 | 35 | def stem_tokens(self, tokens): 36 | return [self.stemmer.stem(item) for item in tokens] 37 | 38 | def tokenize(self, text): 39 | soup = BeautifulSoup(text, 'html.parser') 40 | text = soup.getText() 41 | text = filter(lambda x: x in string.printable, text) 42 | lowers = str(text).lower() 43 | text = lowers.translate(None, string.punctuation) 44 | tokens = nltk.word_tokenize(text) 45 | stems = tokens 46 | #stems = self.stem_tokens(tokens) 47 | return stems 48 | 49 | def tokenize2(self, text): 50 | lowers = str(text).lower() 51 | text = lowers.translate(None, string.punctuation) 52 | return nltk.word_tokenize(text) 53 | 54 | def get_bbc_news_corpus(self): 55 | news_corpus = [] 56 | for news_type in ['business', 'entertainment', 'politics', 'sport', 'tech']: 57 | mypath = self.data_path + 'bbc/' + news_type 58 | onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] 59 | for file_name in onlyfiles: 60 | f = open(self.data_path + 'bbc/' + news_type + '/' + file_name, 'r') 61 | 62 | text = f.read().decode('utf-8', 'replace') 63 | 64 | soup = BeautifulSoup(text, 'html.parser') 65 | text = soup.getText() 66 | text = filter(lambda x: x in string.printable, text) 67 | lowers = str(text).lower() 68 | text = lowers.translate(None, string.punctuation) 69 | 70 | news_corpus.append(text) 71 | f.close() 72 | 73 | return news_corpus 74 | 75 | def get_specifiedby_corpus(self): 76 | specifiedby_corpus = [] 77 | with open(self.data_path + 'specifiedby_corpus.csv', 'rU') as csvfile: 78 | reader = csv.reader(csvfile, delimiter=',', quotechar='"') 79 | header = next(reader) 80 | 81 | processed, failed = 0, 0 82 | for row in reader: 83 | if row: 84 | try: 85 | specifiedby_corpus.append(self.remove_non_ascii(row[0] + ' ' + row[1])) 86 | processed += 1 87 | except Exception, e: 88 | failed += 1 89 | else: 90 | failed += 1 91 | 92 | print 
"get_specifiedby_corpus - processed: {} failed {}".format(processed, failed) 93 | return specifiedby_corpus 94 | #return [" ".join(specifiedby_corpus)] 95 | 96 | def remove_non_ascii(self, text): 97 | """ 98 | Removes non ascii characters by converting them to their integers 99 | and then remove anythin above ref 128 100 | Parameters : 101 | - text: text to remove characters 102 | """ 103 | return ''.join([i if ord(i) < 128 else ' ' for i in text]) 104 | 105 | def train_tfidf(self, tokenizer='custom', corpus='news'): 106 | 107 | if tokenizer == 'custom': 108 | #tokenizer = self.tokenize 109 | tokenizer = self.tokenize2 110 | 111 | nltk_corpus = [] 112 | if corpus == 'all': 113 | nltk_corpus += [nltk.corpus.gutenberg.raw(f_id) for f_id in nltk.corpus.gutenberg.fileids()] 114 | nltk_corpus += [nltk.corpus.webtext.raw(f_id) for f_id in nltk.corpus.webtext.fileids()] 115 | nltk_corpus += [nltk.corpus.brown.raw(f_id) for f_id in nltk.corpus.brown.fileids()] 116 | nltk_corpus += [nltk.corpus.reuters.raw(f_id) for f_id in nltk.corpus.reuters.fileids()] 117 | elif corpus == 'news': 118 | nltk_corpus += self.get_bbc_news_corpus() 119 | nltk_corpus += self.get_specifiedby_corpus() 120 | 121 | if self.verbose: 122 | print "LENGTH nltk corpus corpus: {}".format(sum([len(d) for d in nltk_corpus])) 123 | 124 | 125 | vectorizer = TfidfVectorizer( 126 | max_df=0.5, 127 | min_df=150, 128 | encoding='utf-8', 129 | decode_error='strict', 130 | max_features=None, 131 | stop_words='english', 132 | ngram_range=(1, 3), 133 | norm='l2', 134 | tokenizer=tokenizer, 135 | analyzer='word', 136 | use_idf=True, 137 | sublinear_tf=False) 138 | 139 | #vectorizer.fit_transform(nltk_corpus) 140 | vectorizer.fit(nltk_corpus) 141 | # Avoid having to pickle instance methods, we will set this method on on load 142 | vectorizer.tokenizer = None 143 | keys = np.array(vectorizer.vocabulary_.keys(), dtype=str) 144 | values = np.array(vectorizer.vocabulary_.values(), dtype=int) 145 | stop_words = np.array(list(vectorizer.stop_words_), dtype=str) 146 | 147 | with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'w') as f: 148 | atom = tables.Atom.from_dtype(keys.dtype) 149 | ds = f.createCArray(f.root, 'keys', atom, keys.shape) 150 | ds[:] = keys 151 | 152 | with tables.openFile(self.data_path + 'tfidf_values.hdf', 'w') as f: 153 | atom = tables.Atom.from_dtype(values.dtype) 154 | ds = f.createCArray(f.root, 'values', atom, values.shape) 155 | ds[:] = values 156 | 157 | with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'w') as f: 158 | atom = tables.Atom.from_dtype(stop_words.dtype) 159 | ds = f.createCArray(f.root, 'stop_words', atom, stop_words.shape) 160 | ds[:] = stop_words 161 | 162 | vectorizer.vocabulary_ = None 163 | vectorizer.stop_words_ = None 164 | 165 | with open(self.data_path + 'tfidf.pkl', 'wb') as fin: 166 | cPickle.dump(vectorizer, fin) 167 | 168 | vectorizer.vocabulary_ = dict(zip(keys, values)) 169 | vectorizer.stop_words_ = stop_words 170 | 171 | return vectorizer 172 | 173 | def extract_bigrams(self, text): 174 | 175 | text = self.remove_return_lines_and_quotes(text) 176 | bigrams = [] 177 | 178 | st = PorterStemmer() 179 | stop = stopwords.words('english') 180 | 181 | more_stop_words = [ 182 | '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...'] 183 | stop = stopwords.words('english') 184 | stop = stop + more_stop_words 185 | 186 | tokens = st.stem(text) 187 | tokens = nltk.word_tokenize(tokens.lower()) 188 | tokens = [i for i in tokens if i not in stop] 189 | tokens = [word for word in 
tokens if len(word) > 2] 190 | 191 | bigram_measures = nltk.collocations.BigramAssocMeasures() 192 | finder = BigramCollocationFinder.from_words(tokens) 193 | finder.apply_freq_filter(2) 194 | top_bigrams = finder.nbest(bigram_measures.pmi, 1000) 195 | 196 | for bg in top_bigrams: 197 | bg = " ".join(bg) 198 | tag = nltk.pos_tag([bg])[0] 199 | 200 | if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']: 201 | bigrams.append(tag[0]) 202 | 203 | return bigrams 204 | 205 | def get_tfidf_model(self): 206 | with open(self.data_path + 'tfidf.pkl', 'rb') as pkl_file: 207 | vectorizer = cPickle.load(pkl_file) 208 | 209 | vectorizer.tokenizer = self.tokenize 210 | 211 | with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'r') as f: 212 | keys = f.root.keys.read() 213 | 214 | with tables.openFile(self.data_path + 'tfidf_values.hdf', 'r') as f: 215 | values = f.root.values.read() 216 | 217 | vectorizer.vocabulary_ = dict(zip(keys, values)) 218 | 219 | with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'r') as f: 220 | vectorizer.stop_words_ = set(f.root.stop_words.read()) 221 | 222 | return vectorizer 223 | 224 | def remove_return_lines_and_quotes(self, text): 225 | text = text.replace('\n', ' ') 226 | text = text.replace('\t', ' ') 227 | text = text.replace('\r', ' ') 228 | text = text.replace('"', '') 229 | return text 230 | 231 | def extract(self, documents=None, vectorizer=None, tokenizer='custom', tfidf_corpus='news'): 232 | 233 | try: 234 | vectorizer = self.get_tfidf_model() 235 | except (EOFError, IOError), e: 236 | vectorizer = self.train_tfidf(tokenizer, tfidf_corpus) 237 | 238 | docs = vectorizer.transform(documents) 239 | 240 | feature_names = vectorizer.get_feature_names() 241 | features = [] 242 | for i in xrange(docs.shape[0]): 243 | 244 | sort_score_indices = np.argsort(docs[i, :].data) 245 | top_n_indices = self.num_kewyords if (len(sort_score_indices)) > self.num_kewyords else len(sort_score_indices) 246 | top_features_indices = [] 247 | 248 | if top_n_indices: 249 | top_features_indices = docs[i, :].indices[np.argsort(docs[i, :].data)[::-1][:top_n_indices]] 250 | 251 | top_features_names = [feature_names[f] for f in top_features_indices] 252 | 253 | # Extract most common bigrams. TFIDF gives more relevance to 254 | # unigrams than bigrams 255 | # bigrams = self.extract_bigrams(documents[i]) 256 | # top_features_names = list(set(top_features_names + bigrams)) 257 | 258 | features.append(top_features_names) 259 | 260 | return features 261 | 262 | if __name__ == '__main__': 263 | k = KeywordsExtractor(num_kewyords=100, verbose=True, data_path='../data/') 264 | document = "Iain Duncan Smith has criticised the government's desperate search for savings in his first interview since resigning as work and pensions secretary." 265 | document = "High-quality, contemporary facing brick available with a smooth or textured finish. There are bricks. And there are bricks you can design with. If you're used to assuming a choice of one colour and one finish, why not choose a brick that can become part of your design process instead? The Oakland range of brick can add a spark to architectural designs - whether your next project is traditional, contemporary or avant-garde, choose from a broad range of precision facing bricks with dynamic colours that will help bring the final project to life. Oakland Brick is available in a range of 20 colour and texture combinations. 
LOW EFFLORESCENCE\r\n\r\nAG's brick range is free from soluble salt, meaning that Oakland Brick's levels of efflorescence are extremely low.\r\n\r\n\r\nCOMPLEMENTARY SPECIALS\r\n\r\nA range of complementary specials are available.\r\n\r\n\r\nBRE GREEN GUIDE 'A' RATED\r\n\r\nOakland Brick is produced in the UK from locally sourced materials and manufactured with 90 harvested rain water and 100 renewable energy in the production process.\n\nProperties: Smooth, Textured, 1, BS EN 7713, ISO 9001, ISO 14001, A+, F2, Frost Resistant, A1 Oakland Brick - A contemporary brick with clean, crisp lines" 266 | print k.extract(documents=[document])[0] 267 | # k.get_specifiedby_corpus() 268 | # cProfile.run("k.extract(documents=[document])[0]", 'restats') 269 | p = pstats.Stats('restats') 270 | p.sort_stats('cumulative').print_stats(30) 271 | 272 | 273 | #print k.train_tfidf() 274 | #print k.get_tfidf_model() 275 | -------------------------------------------------------------------------------- /features/main_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup, Comment 3 | from sklearn import preprocessing 4 | import numpy as np 5 | from goose import Goose 6 | from goose.article import Article 7 | from goose.configuration import Configuration 8 | from goose.cleaners import DocumentCleaner 9 | from goose.extractors.content import ContentExtractor 10 | from goose.extractors.images import ImageExtractor 11 | from goose.outputformatters import OutputFormatter 12 | 13 | 14 | class MainTextExtractor(object): 15 | 16 | @classmethod 17 | def _remove_chars(cls, text): 18 | stripped = text.strip(' \t\n\r') 19 | if not stripped: 20 | return None 21 | else: 22 | return stripped 23 | 24 | @classmethod 25 | def _main_paragraph_text(cls, html): 26 | """ 27 | Method to extract the main content by only looking 28 | at the html p tag elements. 
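Paragraphs of min_length (30) characters or fewer are skipped and the surviving paragraphs are joined with newlines.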
29 | """ 30 | 31 | soup = BeautifulSoup(html, 'html.parser') 32 | min_length = 30 33 | feature_text = [] 34 | 35 | for p in soup.find_all('p'): 36 | text_content = MainTextExtractor._remove_chars(p.get_text(strip=True)) 37 | if text_content and (len(text_content) > min_length): 38 | feature_text.append(text_content) 39 | 40 | return ' \n'.join(feature_text) 41 | 42 | @classmethod 43 | def removeCharacters(self, s): 44 | s = s.strip(' \t\n\r') 45 | if ((s == "") | (s==None)): 46 | return None 47 | return s 48 | 49 | @classmethod 50 | def _parse_tags(cls, html): 51 | 52 | excluded_tags = ['script', 'style', 'noscript', 'html', 'head', 'meta', 'header', 'footer', 53 | 'link', 'body', 'input', 'form', 'a'] 54 | minimum_text_node_length = 8 55 | 56 | y_data = [] 57 | text_data = [] 58 | tag_signatures = [] 59 | 60 | soup = BeautifulSoup(html, 'html.parser') 61 | 62 | for tag in soup.findAll(): 63 | 64 | path = '.'.join(reversed([p.name for p in tag.parentGenerator() if p])) 65 | tag_signature = '.'.join([path, tag.name]) 66 | 67 | if (tag.name not in excluded_tags) and ('table' not in path): 68 | 69 | tag_text = [] 70 | for text in tag.contents: 71 | if isinstance(text, Comment): 72 | continue 73 | try: 74 | text = text.strip() 75 | aux = BeautifulSoup(text, 'html.parser') 76 | if aux.find() is None: 77 | tag_text.append(text) 78 | except Exception, e: 79 | pass 80 | 81 | tag_text = "\n".join(tag_text) 82 | 83 | if tag_text and len(tag_text) > minimum_text_node_length: 84 | if tag_text not in text_data: 85 | 86 | # Remove line returns and tabs 87 | tag_text = cls._remove_chars(tag_text) 88 | if tag_text: 89 | y_data.append(len(tag_text)) 90 | text_data.append(tag_text) 91 | tag_signatures.append(path) 92 | 93 | x = np.array(y_data) 94 | return x, text_data, tag_signatures 95 | 96 | @classmethod 97 | def _find_intervals(cls, x): 98 | """ 99 | The main content is ofteb located between two points l_ini and l_end. 100 | This method aims to find two pointers where the distance between l_ini 101 | and l_end is minimum and the number of characters between 102 | these pointers is maximum. 103 | """ 104 | x = np.array(x) 105 | x_length = x.shape[0] 106 | total = np.sum(x) 107 | mean = np.mean(x) 108 | 109 | # Locate where the maximum is 110 | max_pointer = np.argmax(x) 111 | max_accum = 0 112 | pointer_left, pointer_right = max_pointer, max_pointer 113 | 114 | # Find tags higher than average in the left neighbourhood until there 115 | # is no hope 116 | hope_left_state, hope_right_state = True, True 117 | while pointer_left > 0 and hope_left_state: 118 | # if the x[pointer_left - 1] is greater than the mean, move 119 | # the pointer one to the left 120 | if x[pointer_left - 1] > mean: 121 | pointer_left -= 1 122 | else: 123 | # Is it worth to move to the left? If yes, there is hope. 124 | pointer_left_hope = pointer_left - 5 125 | if pointer_left_hope < 0: 126 | pointer_left_hope = 0 127 | 128 | hope_left = x[pointer_left_hope:pointer_left][::-1] 129 | max_hope_left_value = np.max(hope_left) 130 | max_hope_left_idx = pointer_left - np.argmax(hope_left) - 1 131 | 132 | if max_hope_left_value > mean: 133 | pointer_left = max_hope_left_idx 134 | else: 135 | hope_left_state = False 136 | 137 | # Same reasoning as previous one but for the right 138 | while (pointer_right + 1) < x_length and hope_right_state: 139 | if x[pointer_right + 1] > mean: 140 | pointer_right += 1 141 | else: 142 | # Is it worth to move to the right? 
143 | pointer_right_hope = pointer_right + 5 144 | 145 | if pointer_right_hope > len(x): 146 | pointer_right_hope = len(x) 147 | 148 | hope_right = x[(pointer_right + 1): pointer_right_hope] 149 | max_hope_right_value = np.max(hope_right) 150 | max_hope_right_idx = pointer_right + np.argmax(hope_right) + 1 151 | 152 | if max_hope_right_value > mean: 153 | pointer_right = max_hope_right_idx 154 | else: 155 | hope_right_state = False 156 | 157 | # Find a cutoff pointer where the number of characters on the left is 158 | # equal to the number of characters on the right. 159 | # This is just for visualization 160 | accumulated = 0 161 | for i in xrange(0, x_length): 162 | accumulated += x[i] 163 | if accumulated >= (total / 2): 164 | cutoff_point = i 165 | break 166 | 167 | return pointer_left - 1, pointer_right + 1, cutoff_point, mean, max_pointer 168 | 169 | @classmethod 170 | def _refine_intervals(cls, max_tag_signature, max_pointer, text_data, l_ini, l_end): 171 | """ 172 | This method runs after findIntervals and intends to narrow down 173 | where the left and right pointers are. 174 | """ 175 | 176 | max_tag_signature_parts = max_tag_signature[max_pointer].split('.') 177 | tag_max_match = np.zeros(l_end) 178 | 179 | for i in xrange(l_ini, l_end): 180 | tag_signature_aux_parts = max_tag_signature[i].split('.') 181 | max_match = 0 182 | 183 | for j in range(0, len(max_tag_signature_parts)): 184 | try: 185 | if max_tag_signature_parts[j] == tag_signature_aux_parts[j]: 186 | max_match = j 187 | else: 188 | break 189 | except IndexError, e: 190 | break 191 | 192 | tag_max_match[i] = max_match 193 | 194 | tag_max_match = np.asarray(tag_max_match, dtype='float64') 195 | min_max_scaler = preprocessing.MinMaxScaler() 196 | tag_max_match = min_max_scaler.fit_transform(tag_max_match.reshape(-1, 1)) 197 | 198 | tag_max_match = tag_max_match.reshape(1, -1)[0] 199 | 200 | return l_ini, l_end, tag_max_match 201 | 202 | @classmethod 203 | def _combined_tags_text(cls, html): 204 | 205 | x, text_data, tag_signatures = MainTextExtractor._parse_tags(html) 206 | # print x 207 | if x.any(): 208 | l_ini, l_end, cutoff_point, mean, max_pointer = MainTextExtractor._find_intervals(x) 209 | l_ini, l_end, tag_max_match = MainTextExtractor._refine_intervals(tag_signatures, max_pointer, text_data, l_ini, l_end) 210 | 211 | final_text = [] 212 | for i in xrange(l_ini, l_end): 213 | if tag_max_match[i] > 0.65: 214 | final_text.append(text_data[i]) 215 | 216 | # This is only for debugging - Plot the html distribution 217 | # import matplotlib.pyplot as plt 218 | # mean_line = [mean] * x.shape[0] 219 | # #std = np.std(y_data) 220 | 221 | # plt.figure(1) 222 | # plt.subplot(211) 223 | # plt.plot(x) 224 | # plt.plot(mean_line) 225 | # plt.axvline(x=cutoff_point,linewidth=2, color='purple') 226 | # plt.axvline(x=l_ini,linewidth=2, color='r') 227 | # plt.axvline(x=l_end,linewidth=2, color='r') 228 | # plt.ylabel('Num Characters') 229 | # plt.xlabel('Tag Location') 230 | # plt.title('Content Distribution in HTML Page') 231 | # plt.show() 232 | 233 | return '\n'.join(final_text) 234 | else: 235 | return None 236 | 237 | @classmethod 238 | def _goose_cleaned_text(cls, html, page_html): 239 | article = Article() 240 | article.raw_html = html 241 | article.raw_doc = page_html 242 | article.doc = article.raw_doc 243 | 244 | goose_extractor = ContentExtractor(Configuration(), article) 245 | goose_cleaner = DocumentCleaner(Configuration(), article) 246 | goose_formatter = OutputFormatter(Configuration(), article) 247 | # 
goose_image_extractor = ImageExtractor(Configuration(), article) 248 | 249 | try: 250 | article.doc = goose_cleaner.clean() 251 | article.top_node = goose_extractor.calculate_best_node() 252 | if article.top_node is not None: 253 | article.top_node = goose_extractor.post_cleanup() 254 | article.cleaned_text = goose_formatter.get_formatted_text() 255 | except UnicodeDecodeError, e: 256 | article.top_node = None 257 | 258 | return article.cleaned_text 259 | 260 | @classmethod 261 | def extract(cls, html, page_html): 262 | return [MainTextExtractor._goose_cleaned_text(html, page_html), 263 | MainTextExtractor._combined_tags_text(html), 264 | MainTextExtractor._main_paragraph_text(html)] 265 | 266 | if __name__ == '__main__': 267 | # these packages shouldn't really be here: we only need them for improving the 268 | # extractor, and they might interfere with the rest of the project 269 | mE = MainTextExtractor() 270 | target_url = 'http://www.toshiba-aircon.co.uk/products/refrigerant-leak-detection-solutions/refrigerant-leak-detection-solutions/rbc-aip4' 271 | article = Goose().extract(target_url) 272 | print mE.extract(article.raw_html, article.raw_doc) -------------------------------------------------------------------------------- /pyteaser_c.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from math import fabs 3 | from re import split as regex_split, sub as regex_sub 4 | #import nltk 5 | #from nltk import FreqDist 6 | #from nltk.book import * 7 | 8 | 9 | stopWords = [ 10 | "-", " ", ",", ".", "a", "e", "i", "o", "u", "t", "about", "above", 11 | "above", "across", "after", "afterwards", "again", "against", "all", 12 | "almost", "alone", "along", "already", "also", "although", "always", 13 | "am", "among", "amongst", "amoungst", "amount", "an", "and", 14 | "another", "any", "anyhow", "anyone", "anything", "anyway", 15 | "anywhere", "are", "around", "as", "at", "back", "be", "became", 16 | "because", "become", "becomes", "becoming", "been", "before", 17 | "beforehand", "behind", "being", "below", "beside", "besides", 18 | "between", "beyond", "both", "bottom", "but", "by", "call", "can", 19 | "cannot", "can't", "co", "con", "could", "couldn't", "de", 20 | "describe", "detail", "did", "do", "done", "down", "due", "during", 21 | "each", "eg", "eight", "either", "eleven", "else", "elsewhere", 22 | "empty", "enough", "etc", "even", "ever", "every", "everyone", 23 | "everything", "everywhere", "except", "few", "fifteen", "fifty", 24 | "fill", "find", "fire", "first", "five", "for", "former", 25 | "formerly", "forty", "found", "four", "from", "front", "full", 26 | "further", "get", "give", "go", "got", "had", "has", "hasnt", 27 | "have", "he", "hence", "her", "here", "hereafter", "hereby", 28 | "herein", "hereupon", "hers", "herself", "him", "himself", "his", 29 | "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", 30 | "into", "is", "it", "its", "it's", "itself", "just", "keep", "last", 31 | "latter", "latterly", "least", "less", "like", "ltd", "made", "make", 32 | "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", 33 | "moreover", "most", "mostly", "move", "much", "must", "my", "myself", 34 | "name", "namely", "neither", "never", "nevertheless", "new", "next", 35 | "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", 36 | "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", 37 | "onto", "or", "other", "others", "otherwise", "our", "ours", 38 | "ourselves", "out",
"over", "own", "part", "people", "per", 39 | "perhaps", "please", "put", "rather", "re", "said", "same", "see", 40 | "seem", "seemed", "seeming", "seems", "several", "she", "should", 41 | "show", "side", "since", "sincere", "six", "sixty", "so", "some", 42 | "somehow", "someone", "something", "sometime", "sometimes", 43 | "somewhere", "still", "such", "take", "ten", "than", "that", "the", 44 | "their", "them", "themselves", "then", "thence", "there", 45 | "thereafter", "thereby", "therefore", "therein", "thereupon", 46 | "these", "they", "thickv", "thin", "third", "this", "those", 47 | "though", "three", "through", "throughout", "thru", "thus", "to", 48 | "together", "too", "top", "toward", "towards", "twelve", "twenty", 49 | "two", "un", "under", "until", "up", "upon", "us", "use", "very", 50 | "via", "want", "was", "we", "well", "were", "what", "whatever", 51 | "when", "whence", "whenever", "where", "whereafter", "whereas", 52 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", 53 | "while", "whither", "who", "whoever", "whole", "whom", "whose", 54 | "why", "will", "with", "within", "without", "would", "yet", "you", 55 | "your", "yours", "yourself", "yourselves", "the", "reuters", "news", 56 | "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", 57 | "sunday", "mon", "tue", "wed", "thu", "fri", "sat", "sun", 58 | "rappler", "rapplercom", "inquirer", "yahoo", "home", "sports", 59 | "1", "10", "2012", "sa", "says", "tweet", "pm", "home", "homepage", 60 | "sports", "section", "newsinfo", "stories", "story", "photo", 61 | "2013", "na", "ng", "ang", "year", "years", "percent", "ko", "ako", 62 | "yung", "yun", "2", "3", "4", "5", "6", "7", "8", "9", "0", "time", 63 | "january", "february", "march", "april", "may", "june", "july", 64 | "august", "september", "october", "november", "december", 65 | "philippine", "government", "police", "manila" 66 | ] 67 | ideal = 20.0 68 | 69 | 70 | def SummarizeUrl(url): 71 | summaries = [] 72 | try: 73 | article = grab_link(url) 74 | except IOError: 75 | print 'IOError' 76 | return None 77 | 78 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 79 | if not article or not article.cleaned_text or not article.title: 80 | return None 81 | 82 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 83 | title = str(article.title.encode('utf-8', 'ignore')) 84 | print article.author 85 | summaries = Summarize(title, text) 86 | return summaries 87 | 88 | def SummarizePage(html): 89 | summaries = [] 90 | try: 91 | article = grab_page(html) 92 | except IOError: 93 | print 'IOError' 94 | return None 95 | 96 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 97 | if not article or not article.cleaned_text or not article.title: 98 | return None 99 | 100 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 101 | title = str(article.title.encode('utf-8', 'ignore')) 102 | summaries = Summarize(title, text) 103 | return summaries 104 | 105 | def GetArticle(html): 106 | try: 107 | article = grab_page(html) 108 | except IOError: 109 | print 'IOError' 110 | return None 111 | #print ">>> " + str(high) + " - " + item['Source'] + " >>> " + highsen 112 | if not article or not article.cleaned_text or not article.title: 113 | return None 114 | 115 | text = str(article.cleaned_text.encode('utf-8', 'ignore')) 116 | title = str(article.title.encode('utf-8', 'ignore')) 117 | return article 118 | 119 | 120 | def Summarize(title, text): 121 | summaries = [] 122 | sentences = split_sentences(text) 123 | 
#print sentences 124 | keys = keywords(text) 125 | titleWords = split_words(title) 126 | 127 | if len(sentences) <= 5: 128 | return sentences 129 | 130 | #score sentences, and use the top 5 sentences 131 | ranks = score(sentences, titleWords, keys).most_common(5) 132 | for rank in ranks: 133 | summaries.append(rank[0]) 134 | 135 | return summaries 136 | 137 | 138 | def grab_link(inurl): 139 | #extract article information using Python Goose 140 | from goose import Goose 141 | try: 142 | article = Goose().extract(url=inurl) 143 | return article 144 | except ValueError: 145 | print 'Goose error grab' 146 | return None 147 | return None 148 | 149 | def grab_page(html): 150 | #extract article information using Python Goose 151 | from goose import Goose 152 | try: 153 | article = Goose().extract_page(raw_html = html) 154 | return article 155 | except ValueError: 156 | print 'Goose error grab' 157 | return None 158 | return None 159 | 160 | 161 | def score(sentences, titleWords, keywords): 162 | #score sentences based on different features 163 | 164 | senSize = len(sentences) 165 | ranks = Counter() 166 | for i, s in enumerate(sentences): 167 | sentence = split_words(s) 168 | titleFeature = title_score(titleWords, sentence) 169 | sentenceLength = length_score(sentence) 170 | sentencePosition = sentence_position(i+1, senSize) 171 | sbsFeature = sbs(sentence, keywords) 172 | dbsFeature = dbs(sentence, keywords) 173 | frequency = (sbsFeature + dbsFeature) / 2.0 * 10.0 174 | 175 | #weighted average of scores from four categories 176 | totalScore = (titleFeature*1.5 + frequency*2.0 + 177 | sentenceLength*1.0 + sentencePosition*1.0) / 4.0 178 | ranks[s] = totalScore 179 | return ranks 180 | 181 | 182 | def sbs(words, keywords): 183 | score = 0.0 184 | if len(words) == 0: 185 | return 0 186 | for word in words: 187 | if word in keywords: 188 | score += keywords[word] 189 | return (1.0 / fabs(len(words)) * score)/10.0 190 | 191 | 192 | def dbs(words, keywords): 193 | if (len(words) == 0): 194 | return 0 195 | 196 | summ = 0 197 | first = [] 198 | second = [] 199 | 200 | for i, word in enumerate(words): 201 | if word in keywords: 202 | score = keywords[word] 203 | if first == []: 204 | first = [i, score] 205 | else: 206 | second = first 207 | first = [i, score] 208 | dif = first[0] - second[0] 209 | summ += (first[1]*second[1]) / (dif ** 2) 210 | 211 | # number of intersections 212 | k = len(set(keywords.keys()).intersection(set(words))) + 1 213 | return (1/(k*(k+1.0))*summ) 214 | 215 | 216 | def split_words(text): 217 | #split a string into an array of words 218 | try: 219 | text = regex_sub(r'[^\w ]', '', text) # strip special chars 220 | return [x.strip('.').lower() for x in text.split()] 221 | except TypeError: 222 | return None 223 | 224 | 225 | def keywords(text): 226 | #sentences = nltk.sent_tokenize(text) 227 | #sentences = [nltk.word_tokenize(sent) for sent in sentences] 228 | #sentences = [nltk.pos_tag(sent) for sent in sentences] 229 | #print sentences 230 | 231 | #fdist1 = FreqDist(text) 232 | #print fdist1.most_common(50) 233 | #for i in sentences: 234 | # print i + "- \n -" 235 | # pass 236 | #print "--" 237 | 238 | #sentences = [nltk.word_tokenize(sent) for sent in sentences] [2] 239 | #sentences = [nltk.pos_tag(sent) for sent in sentences] 240 | 241 | """get the top 10 keywords and their frequency scores; 242 | ignores blacklisted words in stopWords, 243 | counts the number of occurrences of each word, 244 | and sorts them in reverse natural order (so descending) 245 | by number of occurrences
246 | """ 247 | from operator import itemgetter # for sorting 248 | text = split_words(text) 249 | numWords = len(text) # of words before removing blacklist words 250 | text = [x for x in text if x not in stopWords] 251 | freq = Counter() 252 | for word in text: 253 | freq[word] += 1 254 | 255 | minSize = min(10, len(freq)) 256 | keywords = tuple(freq.most_common(minSize)) # get first 10 257 | keywords = dict((x, y) for x, y in keywords) # recreate a dict 258 | 259 | for k in keywords: 260 | articleScore = keywords[k]*1.0 / numWords 261 | keywords[k] = articleScore * 1.5 + 1 262 | 263 | keywords = sorted(keywords.iteritems(), key=itemgetter(1)) 264 | keywords.reverse() 265 | #print(keywords) 266 | return dict(keywords) 267 | 268 | 269 | def split_sentences(text): 270 | ''' 271 | The regular expression matches all sentence ending punctuation and splits the string at those points. 272 | At this point in the code, the list looks like this ["Hello, world", "!" ... ]. The punctuation and all quotation marks 273 | are separated from the actual text. The first s_iter line turns each group of two items in the list into a tuple, 274 | excluding the last item in the list (the last item in the list does not need to have this performed on it). Then, 275 | the second s_iter line combines each tuple in the list into a single item and removes any whitespace at the beginning 276 | of the line. Now, the s_iter list is formatted correctly but it is missing the last item of the sentences list. The 277 | second to last line adds this item to the s_iter list and the last line returns the full list. 278 | ''' 279 | text = text.decode('utf-8') 280 | sentences = regex_split('(? 0 and normalized <= 0.1: 307 | return 0.17 308 | elif normalized > 0.1 and normalized <= 0.2: 309 | return 0.23 310 | elif normalized > 0.2 and normalized <= 0.3: 311 | return 0.14 312 | elif normalized > 0.3 and normalized <= 0.4: 313 | return 0.08 314 | elif normalized > 0.4 and normalized <= 0.5: 315 | return 0.05 316 | elif normalized > 0.5 and normalized <= 0.6: 317 | return 0.04 318 | elif normalized > 0.6 and normalized <= 0.7: 319 | return 0.06 320 | elif normalized > 0.7 and normalized <= 0.8: 321 | return 0.04 322 | elif normalized > 0.8 and normalized <= 0.9: 323 | return 0.04 324 | elif normalized > 0.9 and normalized <= 1.0: 325 | return 0.15 326 | else: 327 | return 0 328 | --------------------------------------------------------------------------------