├── tests ├── extractors │ ├── __init__.py │ ├── metas.py │ ├── opengraph.py │ ├── authors.py │ ├── links.py │ ├── tweets.py │ ├── title.py │ ├── publishdate.py │ ├── videos.py │ └── tags.py ├── data │ ├── extractors │ │ ├── links │ │ │ ├── test_links.json │ │ │ └── test_links.html │ │ ├── tweets │ │ │ ├── test_tweet.json │ │ │ └── test_tweet.html │ │ ├── title │ │ │ ├── test_title_empty.json │ │ │ ├── test_title_opengraph.json │ │ │ ├── test_title_empty.html │ │ │ └── test_title_opengraph.html │ │ ├── publishdate │ │ │ ├── test_publish_date.json │ │ │ ├── test_publish_date_schema.json │ │ │ ├── test_publish_date_article.json │ │ │ ├── test_publish_date_rnews.json │ │ │ ├── test_publish_date.html │ │ │ ├── test_publish_date_rnews.html │ │ │ ├── test_publish_date_article.html │ │ │ └── test_publish_date_schema.html │ │ ├── images │ │ │ ├── test_basic_image │ │ │ │ ├── 50850547cc7310bc53e30e802c6318f1 │ │ │ │ └── test_basic_image.json │ │ │ ├── test_known_image_empty_src │ │ │ │ ├── test_known_image_empty_src.json │ │ │ │ └── test_known_image_empty_src.html │ │ │ ├── test_opengraph_tag │ │ │ │ ├── test_opengraph_tag.json │ │ │ │ └── test_opengraph_tag.html │ │ │ ├── test_known_image_css_id │ │ │ │ ├── test_known_image_css_id.json │ │ │ │ └── test_known_image_css_id.html │ │ │ ├── test_known_image_css_class │ │ │ │ ├── test_known_image_css_class.json │ │ │ │ └── test_known_image_css_class.html │ │ │ ├── test_known_image_css_parent_id │ │ │ │ ├── test_known_image_css_parent_id.json │ │ │ │ └── test_known_image_css_parent_id.html │ │ │ ├── test_known_image_name_parent │ │ │ │ ├── test_known_image_name_parent.json │ │ │ │ └── test_known_image_name_parent.html │ │ │ └── test_known_image_css_parent_class │ │ │ │ ├── test_known_image_css_parent_class.json │ │ │ │ └── test_known_image_css_parent_class.html │ │ ├── content │ │ │ ├── test_articlebody_tag.json │ │ │ ├── test_articlebody_itemprop.json │ │ │ ├── test_articlebody_attribute.json │ │ │ ├── test_issue129.json │ │ │ 
├── test_okaymarketing.json │ │ │ ├── test_usatoday_issue_74.json │ │ │ ├── test_issue115.json │ │ │ ├── test_mashable_issue_74.json │ │ │ ├── test_politico.json │ │ │ ├── test_issue32.json │ │ │ ├── test_businessinsider3.json │ │ │ ├── test_espn.json │ │ │ ├── test_elmondo1.json │ │ │ ├── test_liberation.json │ │ │ ├── test_issue28.json │ │ │ ├── test_techcrunch1.json │ │ │ ├── test_businessWeek1.json │ │ │ ├── test_cnn1.json │ │ │ ├── test_businessWeek3.json │ │ │ ├── test_yahoo.json │ │ │ ├── test_time.json │ │ │ ├── test_foxNews.json │ │ │ ├── test_businessWeek2.json │ │ │ ├── test_cnbc1.json │ │ │ ├── test_cbslocal.json │ │ │ ├── test_huffingtonPost2.json │ │ │ ├── test_allnewlyrics1.json │ │ │ ├── test_issue4.json │ │ │ ├── test_cnet.json │ │ │ ├── test_aolNews.json │ │ │ ├── test_articlebody_tag.html │ │ │ ├── test_issue25.json │ │ │ ├── test_articlebody_attribute.html │ │ │ ├── test_articlebody_itemprop.html │ │ │ ├── test_cnn_arabic.json │ │ │ ├── test_time2.json │ │ │ ├── test_marketplace.json │ │ │ ├── test_engadget.json │ │ │ ├── test_get_canonical_url.json │ │ │ ├── test_lefigaro.json │ │ │ ├── test_bbc_chinese.json │ │ │ ├── test_testHuffingtonPost.json │ │ │ ├── test_msn1.json │ │ │ ├── test_donga_korean.json │ │ │ ├── test_issue24.html │ │ │ ├── test_issue24.json │ │ │ └── test_elpais.json │ │ ├── authors │ │ │ ├── test_author_schema.json │ │ │ └── test_author_schema.html │ │ ├── tags │ │ │ ├── test_tags_abcau.json │ │ │ ├── test_tags_kexp.json │ │ │ ├── test_tags_deadline.json │ │ │ ├── test_tags_wnyc.json │ │ │ └── test_tags_cnet.json │ │ ├── opengraph │ │ │ ├── test_opengraph.json │ │ │ └── test_opengraph.html │ │ └── videos │ │ │ ├── test_iframe.json │ │ │ ├── test_embed.json │ │ │ ├── test_object.json │ │ │ ├── test_iframe.html │ │ │ ├── test_embed.html │ │ │ └── test_object.html │ └── parser │ │ └── test1.html ├── __init__.py ├── article.py └── configuration.py ├── requirements.txt ├── MANIFEST.in ├── THANKS ├── .gitignore ├── .travis.yml ├── 
goose ├── resources │ ├── images │ │ └── known-image-css.txt │ └── text │ │ ├── stopwords-nl.txt │ │ ├── stopwords-ko.txt │ │ ├── stopwords-fi.txt │ │ ├── stopwords-da.txt │ │ ├── stopwords-no.txt │ │ ├── stopwords-nb.txt │ │ ├── stopwords-zh.txt │ │ ├── stopwords-pt.txt │ │ ├── stopwords-ar.txt │ │ ├── stopwords-pl.txt │ │ ├── stopwords-fr.txt │ │ ├── stopwords-es.txt │ │ ├── stopwords-hu.txt │ │ ├── stopwords-it.txt │ │ └── stopwords-ru.txt ├── version.py ├── extractors │ ├── __init__.py │ ├── links.py │ ├── opengraph.py │ ├── tweets.py │ ├── authors.py │ ├── tags.py │ ├── publishdate.py │ ├── title.py │ ├── metas.py │ └── videos.py ├── video.py ├── network.py ├── image.py ├── __init__.py ├── configuration.py └── utils │ ├── __init__.py │ └── images.py └── setup.py /tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | lxml 3 | cssselect 4 | jieba 5 | beautifulsoup 6 | nltk 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include goose/resources/images * 2 | recursive-include goose/resources/text * -------------------------------------------------------------------------------- /tests/data/extractors/links/test_links.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/links/", 3 | "expected": { 4 | "links": 2 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/tweets/test_tweet.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/tweet/", 3 | "expected": { 4 | 
"tweets": 2 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /THANKS: -------------------------------------------------------------------------------- 1 | Thanks to all who have contributed to python-goose. 2 | You can find the contributors list here: 3 | https://github.com/grangier/python-goose/graphs/contributors -------------------------------------------------------------------------------- /tests/data/extractors/title/test_title_empty.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_title_empty.html", 3 | "expected": { 4 | "title": "" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | goose.egg-info/ 3 | build/ 4 | dist/ 5 | .DS_Store* 6 | ._.DS_Store* 7 | env/ 8 | *~ 9 | .idea 10 | ._* 11 | *.egg 12 | venv/ 13 | goose_extractor.egg-info/ 14 | -------------------------------------------------------------------------------- /tests/data/extractors/title/test_title_opengraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_opengraphcontent", 3 | "expected": { 4 | "title": "Good article title" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://example.com/example", 3 | "expected": { 4 | "publish_date": "2014-06-30T16:54:02+00:00" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": 
"http://example.com/example", 3 | "expected": { 4 | "publish_date": "2014-10-09T12:06:16" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_article.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://example.com/example", 3 | "expected": { 4 | "publish_date": "2012-01-11T15:55:01+00:00" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_rnews.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://example.com/example", 3 | "expected": { 4 | "publish_date": "2010-02-22T11:53:04+00:00" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/python-goose/HEAD/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_rnews.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_articlebody_tag.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_opengraphcontent", 3 | 
"expected": { 4 | "cleaned_text": "Search-and-rescue teams were mobilized " 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_articlebody_itemprop.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_opengraphcontent", 3 | "expected": { 4 | "cleaned_text": "Search-and-rescue teams were mobilized " 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 2.6 5 | - 2.7 6 | 7 | install: 8 | - pip install -r requirements.txt --use-mirrors 9 | - python setup.py install 10 | 11 | script: python setup.py test 12 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_articlebody_attribute.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_opengraphcontent", 3 | "expected": { 4 | "cleaned_text": "Search-and-rescue teams were mobilized " 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_issue129.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://lostinjit.blogspot.fr/2011/10/pypy-and-road-towards-scipy.html", 3 | "expected": { 4 | "cleaned_text": "Recent PyPys effort to bring NumPy and the associated fundraiser" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- 
/goose/resources/images/known-image-css.txt: -------------------------------------------------------------------------------- 1 | latimes.com^thumbnail 2 | cnn.com^storytext|cnn_strycntntlft 3 | foxnews.com^entry-content 4 | msn.com^articleText 5 | go.com^mediaimage 6 | lefigaro.fr^photo center 7 | cadres.apec.fr^noFieldsTable 8 | emploi.lesechos.fr^offerHeader 9 | linkfinance.fr^offerHeader -------------------------------------------------------------------------------- /tests/data/extractors/authors/test_author_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/tweet/", 3 | "expected": { 4 | "authors": [ 5 | "KEVIN SACK", 6 | "ADAM NOSSITER", 7 | "PAM BELLUCK", 8 | "SHERI FINK" 9 | ] 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/data/extractors/tags/test_tags_abcau.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.abc.net.au/news/2013-04-22/swimming-greats-say-cuts-a-shame/4644544", 3 | "expected": { 4 | "tags": [ 5 | "olympics-summer", 6 | "australia", 7 | "swimming" 8 | ] 9 | } 10 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_okaymarketing.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4", 3 | "expected": { 4 | "cleaned_text": "If you are operating a local business there is something you can do right now to gain an advantage over your competition." 
5 | } 6 | } -------------------------------------------------------------------------------- /tests/data/extractors/tags/test_tags_kexp.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", 3 | "expected": { 4 | "tags": [ 5 | "kusp film review", 6 | "Stand Up Guys", 7 | "film", 8 | "Dennis Morton" 9 | ] 10 | } 11 | } -------------------------------------------------------------------------------- /tests/data/extractors/tags/test_tags_deadline.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.deadline.com/2013/06/deadline-big-media-with-david-lieberman-episode-38/", 3 | "expected": { 4 | "tags": [ 5 | "Deadline Big Media", 6 | "TiVo", 7 | "Amazon Prime", 8 | "Steve Ballmer" 9 | ] 10 | } 11 | } -------------------------------------------------------------------------------- /tests/data/extractors/tags/test_tags_wnyc.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.wnyc.org/shows/heresthething/2013/may/27/", 3 | "expected": { 4 | "tags": [ 5 | "Life", 6 | "alec baldwin", 7 | "other desert cities", 8 | "News", 9 | "Music", 10 | "stacy keach" 11 | ] 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_usatoday_issue_74.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.usatoday.com/story/tech/columnist/talkingtech/2014/01/25/namm-2014---ik-multimedias-rings-to-make-music/4863193/", 3 | "expected": { 4 | "cleaned_text": "ANAHEIM, Calif. — Musicians often show off lots of ring bling —but rarely have rings been thought of for making music — until now." 
5 | } 6 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_issue115.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://blogs.wsj.com/accelerators/2014/06/03/jessica-livingston-why-startups-need-to-focus-on-sales-not-marketing/", 3 | "expected": { 4 | "cleaned_text": "JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-nl.txt: -------------------------------------------------------------------------------- 1 | aan 2 | af 3 | al 4 | als 5 | bij 6 | dan 7 | dat 8 | die 9 | dit 10 | een 11 | en 12 | er 13 | had 14 | heb 15 | hem 16 | het 17 | hij 18 | hoe 19 | hun 20 | ik 21 | in 22 | is 23 | je 24 | kan 25 | me 26 | men 27 | met 28 | mij 29 | nog 30 | nu 31 | of 32 | ons 33 | ook 34 | te 35 | tot 36 | uit 37 | van 38 | was 39 | wat 40 | we 41 | wel 42 | wij 43 | zal 44 | ze 45 | zei 46 | zij 47 | zo 48 | zou 49 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_mashable_issue_74.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://mashable.com/2014/01/26/square-cofounder-jim-mckelvey/", 3 | "expected": { 4 | "cleaned_text": "Some 2,000 miles away from Square's massive new headquarters in San Francisco, Jim McKelvey is standing in work boots and a thick dock coat trying to revive a city.\n\nMcKelvey founded the mobile payments company in 2009 with Jack Dorsey, who had previously helped launch Twitter." 
5 | } 6 | } -------------------------------------------------------------------------------- /tests/data/extractors/opengraph/test_opengraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://exemple.com/test_opengraphcontent", 3 | "expected": { 4 | "opengraph": { 5 | "url": "http://www.somenews.com/2012/09/19/nyregion/some-news-article.html?pagewanted=all", 6 | "image": "http://graphics8.somenews.com/images/2012/09/19/region/some-news-image.jpg", 7 | "type": "article", 8 | "description": "Some News Happened in New York", 9 | "title": "Some News Article Story" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-ko.txt: -------------------------------------------------------------------------------- 1 | 을 2 | 의 3 | 에 4 | 이 5 | 를 6 | 으로 7 | 은 8 | 는 9 | 가 10 | 로 11 | 하고 12 | 과 13 | 에서 14 | 도 15 | 와 16 | 이다 17 | 고 18 | 부터 19 | 까지 20 | 께 21 | 에는 22 | 이라고 23 | 만 24 | 라고 25 | 보다 26 | 에도 27 | 다 28 | 토록 29 | 에게 30 | 나 31 | 대로 32 | 에서는 33 | 이나 34 | 이며 35 | 요 36 | 든 37 | 으로써 38 | 같이 39 | 로는 40 | 밖에 41 | 과의 42 | 며 43 | 로부터 44 | 처럼 45 | 아 46 | 라 47 | 여 48 | 으로는 49 | 이고 50 | 에서의 51 | 이라는 52 | 만에 53 | 으로부터 54 | 에서도 55 | 와의 56 | 엔 57 | 만을 58 | 부터는 59 | 만의 60 | 야 61 | 까지의 62 | 과는 63 | 치고 64 | 과를 65 | 으로의 66 | 까지는 67 | 보다는 68 | 만이 69 | 에만 70 | 로의 -------------------------------------------------------------------------------- /tests/data/extractors/images/test_basic_image/test_basic_image.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://blogs.kusp.org/filmgang/2013/02/08/stand-up-guys/", 3 | "expected": { 4 | "top_image": { 5 | "extraction_type": "bigimage", 6 | "src": "http://md0.libe.com/photo/465395/?modified_at=1351411813&ratio_x=03&ratio_y=02&width=476", 7 | "confidence_score": 100, 8 | "bytes": 0, 9 | "height": 317, 10 | "width": 476, 11 | "top_image_node": null 12 | } 
13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "NA", 7 | "src": "", 8 | "confidence_score": 0.0, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "opengraph", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 100, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "known", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 90, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } 
-------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "known", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 90, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "known", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 90, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "known", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 90, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } 
-------------------------------------------------------------------------------- /tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://go.com/bla/bla", 3 | "expected": { 4 | "cleaned_text" : "TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start u", 5 | "top_image": { 6 | "extraction_type": "known", 7 | "src": "http://go.com/images/465395/", 8 | "confidence_score": 90, 9 | "bytes": 0, 10 | "height": 0, 11 | "width": 0, 12 | "top_image_node": null 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_politico.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.politico.com/news/stories/1010/43352.html", 3 | "expected": { 4 | "meta_description": "Demographic changes are likely to alter the route Obama took to victory in 2008.", 5 | "domain": "www.politico.com", 6 | "final_url": "http://www.politico.com/news/stories/1010/43352.html", 7 | "meta_keywords": "2012, Maggie Haberman and Shira Toeplitz", 8 | "cleaned_text": "If the newest Census Bureau estimates stay close to form", 9 | "meta_favicon": "http://www.politico.com/favicon.ico", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/tags/test_tags_cnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.cnet.com/8301-13952_1-57596170-81/the-404-1310-where-its-love-at-first-swipe-podcast/", 3 | "expected": { 4 | "tags": [ 5 | "purgatory", 6 | "USDATE", 7 | "Pope", 8 | "online dating", 9 | "leftovers", 10 | "app", 11 | "Yahoo", 12 | "OKCupid", 13 | "romance", 14 | "Pontifex", 15 | "Tinder", 16 | "Leftover Swap", 17 | 
"Match.com", 18 | "Twitter", 19 | "Marc Maron" 20 | ] 21 | } 22 | } -------------------------------------------------------------------------------- /tests/data/parser/test1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |

5 | xxxx aaaaa xxxxxxxx 6 | span span 7 |

8 |
9 |
10 |

11 | xxxx aaaaa xxxxxxxx 12 | span span 13 |

14 |
15 |
16 |

17 | xxxx aaaaa xxxxxxxx 18 | span span 19 |

20 |
21 |
22 |

23 | xxxx aaaaa xxxxxxxx 24 | span span 25 |

26 |

test

27 |
28 | 29 | 30 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_issue32.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7", 3 | "expected": { 4 | "meta_description": "", 5 | "domain": "www.tulsaworld.com", 6 | "final_url": "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7", 7 | "meta_keywords": "COURT RULE INFORMATION RECORDS DISTRICT OKLAHOMA PERSONAL PROPOSAL PROPOSED REASONS", 8 | "cleaned_text": "Opposition to a proposal to remove certain personal data", 9 | "meta_favicon": "/favicon.ico", 10 | "meta_lang": null 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_businessinsider3.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4", 3 | "expected": { 4 | "meta_description": "One day in July 2001, Larry Page decided to...", 5 | "domain": "www.businessinsider.com", 6 | "final_url": "http://www.businessinsider.com/larry-page-the-untold-story-2014-4", 7 | "meta_keywords": "Google, Larry Page, Longform, Nicholas Carlson,", 8 | "cleaned_text": "One day in July 2001, Larry Page decided to fire Google", 9 | "meta_favicon": "http://static5.businessinsider.com/assets/images/faviconBI.ico", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_espn.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://sports.espn.go.com/espn/commentary/news/story?id=5461430", 3 | "expected": { 4 | "meta_description": "Are Florida coach Urban Meyer and Alabama coach Nick Saban 
closing practice because of agents or because they like to control every aspect of their programs?", 5 | "domain": "sports.espn.go.com", 6 | "final_url": "http://sports.espn.go.com/espn/commentary/news/story?id=5461430", 7 | "meta_keywords": "", 8 | "cleaned_text": "If you believe what college football coaches have said about sports", 9 | "meta_favicon": "", 10 | "meta_lang": null 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_elmondo1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.elmundo.es/elmundo/2012/10/28/espana/1351388909.html", 3 | "target_language": "es", 4 | "expected": { 5 | "meta_description": "Detenida en Francia Izaskun Lesaka Izaskun Lesaka pas\u00f3 a la c\u00fapula tras la detenci\u00f3n de Ata. Su pareja y lugarteniente, Joseba Iturbe, tambi\u00e9n ha sido detenido.", 6 | "domain": "www.elmundo.es", 7 | "final_url": "http://www.elmundo.es/elmundo/2012/10/28/espana/1351388909.html", 8 | "meta_keywords": "Detenida, Francia, Izaskun, Lesaka, Espa\u00f1a", 9 | "cleaned_text": "Importante golpe a la banda terrorista ETA en Francia.", 10 | "meta_favicon": "", 11 | "meta_lang": null 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_liberation.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.liberation.fr/politiques/2012/10/27/ayrault-assume-et-revendique-sa-methode_856451", 3 | "expected": { 4 | "meta_description": "Apr\u00e8s une semaine agit\u00e9e, le Premier ministre s'est offert un succ\u00e8s d'estrade \u00e0 bon compte lors du congr\u00e8s du Parti socialiste \u00e0 Toulouse.", 5 | "domain": "www.liberation.fr", 6 | "final_url": "http://www.liberation.fr/politiques/2012/10/27/ayrault-assume-et-revendique-sa-methode_856451", 7 | "meta_keywords": 
"actualit\u00e9s, news", 8 | "cleaned_text": "A Toulouse, Jean-Marc Ayrault aura fait deux rappels sur", 9 | "meta_favicon": "", 10 | "meta_lang": "fr" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/title/test_title_empty.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 |

8 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 9 |

10 |
11 | 12 | 13 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-fi.txt: -------------------------------------------------------------------------------- 1 | alla 2 | ansiosta 3 | ehkä 4 | ei 5 | enemmän 6 | ennen 7 | etessa 8 | f 9 | haikki 10 | he 11 | hitaasti 12 | hoikein 13 | hyvin 14 | hän 15 | ilman 16 | ja 17 | jos 18 | jälkeen 19 | kanssa 20 | kaukana 21 | kenties 22 | keskellä 23 | kesken 24 | koskaan 25 | kuinkan 26 | kukka 27 | kylliksi 28 | kyllä 29 | liian 30 | lla 31 | lla 32 | luona 33 | lähellä 34 | läpi 35 | me 36 | miksi 37 | mikä 38 | milloin 39 | milloinkan 40 | minä 41 | missä 42 | miten 43 | nopeasti 44 | nyt 45 | oikea 46 | oikealla 47 | paljon 48 | siellä 49 | sinä 50 | ssa 51 | sta 52 | suoraan 53 | tai 54 | takana 55 | takia 56 | tarpeeksi 57 | te 58 | tässä 59 | ulkopuolella 60 | vahemmän 61 | vasen 62 | vasenmalla 63 | vastan 64 | vielä 65 | vieressä 66 | vähän 67 | yhdessä 68 | ylös 69 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_issue28.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html", 3 | "expected": { 4 | "meta_description": "A 'world's hottest chilli' competition at a curry restaurant left two people\n in hospital.", 5 | "domain": "www.telegraph.co.uk", 6 | "final_url": "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html", 7 | "meta_keywords": "Curry-competition, Food and Drink News,Food and Drink", 8 | "cleaned_text": "Emergency services were called to Kismot Restaurant's curry-eating challenge,", 9 | "meta_favicon": "", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- 
/tests/data/extractors/content/test_techcrunch1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/", 3 | "expected": { 4 | "meta_description": "", 5 | "domain": "techcrunch.com", 6 | "final_url": "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/", 7 | "meta_keywords": "", 8 | "cleaned_text": "The Huffington Post has come across this fascinating five-minute interview", 9 | "tags": [ 10 | "facebook" 11 | ], 12 | "title": "2005 Zuckerberg Didn\u2019t Want To Take Over The World", 13 | "meta_favicon": "http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1310283187g", 14 | "meta_lang": "en" 15 | } 16 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_businessWeek1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm", 3 | "expected": { 4 | "meta_description": "The Web and cable star has achieved fame by targeting nerdy guys, who she says \"control popularity\".", 5 | "domain": "www.businessweek.com", 6 | "final_url": "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm", 7 | "meta_keywords": "Olivia Munn, Attack of the Show, Jon Stewart, Daily Show, G4", 8 | "cleaned_text": "Six years ago, Olivia Munn arrived in Hollywood with fading ambitions of making it as a sports reporter and set about deploying", 9 | "title": "Olivia Munn: Queen of the Uncool", 10 | "meta_favicon": "", 11 | "meta_lang": "en" 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_cnn1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": 
"http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", 3 | "expected": { 4 | "meta_description": "Democrats pledged Friday to not only keep Social Security in place, but use the historic program against Republicans ahead of the midterm election.", 5 | "domain": "www.cnn.com", 6 | "final_url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", 7 | "meta_keywords": "", 8 | "cleaned_text": "Washington (CNN) -- Democrats pledged ", 9 | "title": "Democrats to use Social Security against GOP this fall - CNN.com", 10 | "meta_favicon": "http://i.cdn.turner.com/cnn/.element/img/3.0/global/misc/apple-touch-icon.png", 11 | "meta_lang": "en" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_businessWeek3.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.businessweek.com/technology/here-comes-apples-real-tv-09132011.html", 3 | "expected": { 4 | "meta_description": "A bold, new Apple TV set would replace today\u2019s cable systems, game consoles, and 3D goggles\u2014and launch a war with cable providers", 5 | "domain": "www.businessweek.com", 6 | "final_url": "http://www.businessweek.com/technology/here-comes-apples-real-tv-09132011.html", 7 | "meta_keywords": "Apple, Apple CEO, Google, Television, Cable & Wireless, Netflix, Steve Jobs, Comcast, cable, cable TV, hulu, Roku", 8 | "cleaned_text": "Get ready, America, because by Christmas 2012 you will have an Apple TV in your living room", 9 | "meta_favicon": "", 10 | "meta_lang": null 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_yahoo.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://news.yahoo.com/apple-says-steve-jobs-resigning-ceo-224628633.html", 3 | "expected": { 4 | 
"meta_description": "Read 'Apple says Steve Jobs resigning as CEO' on Yahoo! News. Steve Jobs, the mind behind the iPhone, iPad and other devices that turned Apple Inc. into one of the world's most powerful companies, resigned as CEO on Wednesday, saying he can no longer handle the job but will continue to play a leadership role.", 5 | "domain": "news.yahoo.com", 6 | "final_url": "http://news.yahoo.com/apple-says-steve-jobs-resigning-ceo-224628633.html", 7 | "meta_keywords": "", 8 | "cleaned_text": "SAN FRANCISCO (AP) \u2014 Steve Jobs, the mind behind the iPhone", 9 | "meta_favicon": "", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_time.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", 3 | "expected": { 4 | "meta_description": "Researchers at the University of Georgia believe that much of the oil from the BP spill is still present underwater in the Gulf of Mexico, where its impact on aquatic life is far from clear", 5 | "domain": "www.time.com", 6 | "final_url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", 7 | "meta_keywords": "bp, oil, spill, gulf, mexico, invisible, dispersed, deepwater horizon, Charles Hopkinson", 8 | "cleaned_text": "This month, the federal government released", 9 | "title": "Oil from Spill Could Still Pose Major Threat", 10 | "meta_favicon": "http://img.timeinc.net/time/favicon.ico", 11 | "meta_lang": null 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_foxNews.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.foxnews.com/politics/2010/08/14/russias-nuclear-help-iran-stirs-questions-improved-relations/", 3 | "expected": { 4 | "meta_description": 
"Russia's announcement that it will help Iran get nuclear fuel is raising questions about the better-than- ever relationship between Russia and the U.S. , according to President Obama, after the two former Cold War adversaries recently signed a nuclear reduction treaty.", 5 | "domain": "www.foxnews.com", 6 | "final_url": "http://www.foxnews.com/politics/2010/08/14/russias-nuclear-help-iran-stirs-questions-improved-relations/", 7 | "meta_keywords": "", 8 | "cleaned_text": "Russia's announcement that it will help Iran get nuclear fuel is raising questions", 9 | "meta_favicon": "", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_businessWeek2.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.businessweek.com/management/five-social-media-lessons-for-business-09202011.html", 3 | "expected": { 4 | "meta_description": "A Home Depot executive discusses the retailer's strategy for engaging consumers via Facebook, Twitter, and blogs, relying on store associates for much of the social interaction", 5 | "domain": "www.businessweek.com", 6 | "final_url": "http://www.businessweek.com/management/five-social-media-lessons-for-business-09202011.html", 7 | "meta_keywords": "Facebook, Twitter, social media, Home Depot, retailers, social media lessons", 8 | "cleaned_text": "At Home Depot, we first realized we needed to have a real conversation with", 9 | "title": "Five Social Media Lessons for Business", 10 | "meta_favicon": "", 11 | "meta_lang": null 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_cnbc1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.cnbc.com/id/44613978", 3 | "expected": { 4 | "meta_description": "The Fed launched much anticipated \"operation 
twist\" with a twist\u2014it is also taking direct aim at mortgages. The Fed also warned of significant downside economic risks. Check out how it changed its statement.", 5 | "domain": "www.cnbc.com", 6 | "final_url": "http://www.cnbc.com/id/44613978", 7 | "meta_keywords": "Bonds,Economy,Interest Rates,Economy (Global),Banking,Economic Measures,Debt,Central Banks,Ben Bernanke,Federal Reserve,Currencies,Investment Strategy,Top Blogs", 8 | "cleaned_text": "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.", 9 | "meta_favicon": "http://media.cnbc.com/i/CNBC/CNBC_Images/mobile_images/cnbc_iphone_icon.png", 10 | "meta_lang": null 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_cbslocal.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://newyork.cbslocal.com/2012/06/08/bc-morning-show-american-hero-kelly-malloy/", 3 | "expected": { 4 | "meta_description": "Boomer & Craig were thrilled to welcome an American Hero into the Allstate Studio, as Kelly Malloy stopped-by and was given the royal treatment she deserved...", 5 | "domain": "newyork.cbslocal.com", 6 | "final_url": "http://newyork.cbslocal.com/2012/06/08/bc-morning-show-american-hero-kelly-malloy/", 7 | "meta_keywords": "vibNews", 8 | "cleaned_text": "Boomer & Craig were thrilled to welcome an American Hero into the Allstate Studio, as Kelly", 9 | "tags": [ 10 | "Boomer & Carton", 11 | "Kelly Malloy", 12 | "Bobby Dwyer" 13 | ], 14 | "meta_favicon": "http://s2.wp.com/i/favicon.ico?m=1311976027g", 15 | "meta_lang": "en" 16 | } 17 | } -------------------------------------------------------------------------------- /tests/data/extractors/title/test_title_opengraph.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wrong article title - website 6 | 7 | 8 |
9 |

10 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 11 |

12 |
13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_huffingtonPost2.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.huffingtonpost.com/2011/10/06/alabama-workers-immigration-law_n_997793.html", 3 | "expected": { 4 | "meta_description": "MONTGOMERY, Ala. -- Alabama's strict new immigration law may be backfiring. Intended to force illegal workers out of jobs, it is also driving away many construction workers, roofers and field hands in the country legally who do backbreaking jobs that Americans generally won't.", 5 | "domain": "www.huffingtonpost.com", 6 | "final_url": "http://www.huffingtonpost.com/2011/10/06/alabama-workers-immigration-law_n_997793.html", 7 | "meta_keywords": "alabama, workers, leave, state, as, immigration, law, takes, effect, business", 8 | "cleaned_text": "MONTGOMERY, Ala. -- Alabama's strict new immigration law may be backfiring.", 9 | "meta_favicon": "/favicon.ico", 10 | "meta_lang": "en" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_allnewlyrics1.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://allnewlyrics.com/only-one-lyrics-pj-morton-ft-stevie-wonder.html", 3 | "expected": { 4 | "meta_description": "All about hot new song with lyrics at AllNewLyrics.Com", 5 | "domain": "allnewlyrics.com", 6 | "final_url": "http://allnewlyrics.com/only-one-lyrics-pj-morton-ft-stevie-wonder.html", 7 | "meta_keywords": "Music, Songs, Lyrics, Letras, Lirik, Tekst, Text, Testo, Paroles, Popular, New", 8 | "cleaned_text": "PJ Morton \u2013 Only One Lyrics (Ft. 
Stevie Wonder)\n\nI\u2019m pretty sure I don\u2019t need anything else\n\n This is the best feeling I\u2019ve ever felt", 9 | "tags": [ 10 | "PJ Morton", 11 | "Stevie Wonder" 12 | ], 13 | "title": "\u201cOnly One\u201d Lyrics : PJ Morton (Ft. Stevie Wonder)", 14 | "meta_favicon": "", 15 | "meta_lang": "en" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_issue4.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.slate.fr/story/64063/tapie-mougeotte-la-provence", 3 | "target_language": "fr", 4 | "expected": { 5 | "meta_description": "L'ex-pr\u00e9sident de l'OM et l'ancien PDG de TF1 s'int\u00e9resseraient au rachat du quotidien r\u00e9gional. Nous vous proposons, comme au lyc\u00e9e, un exercice pour en d\u00e9gager la signification.", 6 | "domain": "www.slate.fr", 7 | "final_url": "http://www.slate.fr/story/64063/tapie-mougeotte-la-provence", 8 | "meta_keywords": "FRANCE,Bernard Tapie,Etienne Mougeotte,presse,Qatar,Cr\u00e9dit Lyonnaus,La Provence,aides de l'Etat,politique,marseille,m\u00e9dias fran\u00e7ais,", 9 | "cleaned_text": "Exercice: apr\u00e8s avoir attentivement lu cette br\u00e8ve parue dans L'Express, vous expliquerez en quoi elle r\u00e9sume une certaine id\u00e9e de la France.\n\n\u00abBernar", 10 | "meta_favicon": "", 11 | "meta_lang": "fr" 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/extractors/content/test_cnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://news.cnet.com/8301-30686_3-20014053-266.html?tag=topStories1", 3 | "expected": { 4 | "meta_description": "The phone company is adding bells and whistles to its Fios TV service, including an app that turns an iPad into a TV and several other new options for taking your video content on the go. 
Read this blog post by Marguerite Reardon on Signal Strength.", 5 | "domain": "news.cnet.com", 6 | "final_url": "http://news.cnet.com/8301-30686_3-20014053-266.html?tag=topStories1", 7 | "meta_keywords": "Marguerite Reardon, wireless, broadband, telecom", 8 | "cleaned_text": "NEW YORK--Verizon Communications is prepping a new", 9 | "tags": [ 10 | "iPad", 11 | "Verizon Communications", 12 | "Verizon Fios TV", 13 | "Fios", 14 | "Apple iPad" 15 | ], 16 | "meta_favicon": "", 17 | "meta_lang": null 18 | } 19 | } -------------------------------------------------------------------------------- /tests/data/extractors/publishdate/test_publish_date_schema.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test video 5 | 6 | 7 | 8 |
9 | 10 |

11 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 12 |

13 |
14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/data/extractors/videos/test_iframe.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://foo.bar/index.html", 3 | "expected": { 4 | "movies": [ 5 | { 6 | "src": "http://www.dailymotion.com/embed/video/x130bpf", 7 | "embed_code": " 14 |

15 |

16 | Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. 17 | For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin. 18 |

19 |

20 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. 21 |

22 |

23 | 24 |

25 |

26 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. 27 | In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day. 28 |

29 |

30 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle. 31 |

32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/data/extractors/content/test_elpais.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "http://www.sociedad.elpais.com/sociedad/2012/10/27/actualidad/1351332873_157836.html", 3 | "target_language": "es", 4 | "expected": { 5 | "meta_description": "Los recortes elevan la demora para operarse un 125% en a\u00f1o y medio. El n\u00famero de pacientes que esperan m\u00e1s de 180 d\u00edas para entrar en quir\u00f3fano crece un 178%", 6 | "domain": "www.sociedad.elpais.com", 7 | "final_url": "http://www.sociedad.elpais.com/sociedad/2012/10/27/actualidad/1351332873_157836.html", 8 | "meta_keywords": "lista, espera, agravar, recorte, elevar, demora, operar, 125 %, a\u00f1o, medio, n\u00famero, paciente, aguardar, 180, d\u00eda, entrar, quir\u00f3fano, crecer, 178 %", 9 | "cleaned_text": "Los recortes pasan factura a los pacientes.", 10 | "tags": [ 11 | "Asistencia sanitaria", 12 | "Igualdad", 13 | "Copa Davis", 14 | "Copa del Rey de F\u00fatbol", 15 | "Motociclismo", 16 | "Sistema sanitario", 17 | "Defensor del Lector", 18 | "Cine", 19 | "Vacunaci\u00f3n", 20 | "Giro de Italia", 21 | "Comunicaci\u00f3n", 22 | "Elecciones EE UU 2012", 23 | "Sanidad", 24 | "\u00daLTIMA HORA", 25 | "Columnas", 26 | "Pol\u00edtica social", 27 | "Medicina", 28 | "Ciencia", 29 | "Wimbledon", 30 | "Educaci\u00f3n", 31 | "US Open", 32 | "Videos Champions", 33 | "Administraci\u00f3n auton\u00f3mica", 34 | "Oscars", 35 | "Farmacias", 36 | "Salud", 37 | "M\u00e1s temas \u00bb", 38 | "Especialidades m\u00e9dicas", 39 | "F\u00fatbol", 40 | "Europa Convulsa", 41 | "Roland Garros", 42 | "Golf", 43 | "Gastronom\u00eda", 44 | "Copa del Rey Basket", 45 | "Tour de Francia", 46 | "Vuelta Espa\u00f1a", 47 | "C\u00e1ritas", 48 | "Otros Deportes", 49 | "F\u00f3rmula 1", 50 | "Champions League", 51 | "Ciclismo", 52 | "27 OCT 2012 - 12:14 CET", 53 | "Listas 
espera", 54 | "Editoriales", 55 | "RTVE", 56 | "Custodia hijos", 57 | "Juegos Ol\u00edmpicos", 58 | "Titulares \u00bb", 59 | "Vi\u00f1etas", 60 | "Sanidad p\u00fablica", 61 | "Atenci\u00f3n al paciente", 62 | "Tribunas", 63 | "Moda", 64 | "M\u00fasica", 65 | "Tenis", 66 | "Medio Ambiente", 67 | "Recortes sociales", 68 | "Teatro/Danza", 69 | "Baloncesto", 70 | "Comunidades aut\u00f3nomas", 71 | "Selecci\u00f3n Espa\u00f1ola", 72 | "Open Australia", 73 | "El Espa\u00f1ol", 74 | "Consumo", 75 | "Enfermedades raras", 76 | "Huelga General", 77 | "Declaracion Renta", 78 | "Elecciones Francia 2012", 79 | "Sociedad", 80 | "Elecciones Generales", 81 | "Libros", 82 | "Coches con Estilo", 83 | "El final de ETA", 84 | "Administraci\u00f3n p\u00fablica", 85 | "Centrales nucleares", 86 | "\u00cdndice", 87 | "Pacientes" 88 | ], 89 | "meta_favicon": "http://ep01.epimg.net/favicon.png", 90 | "meta_lang": "es" 91 | } 92 | } -------------------------------------------------------------------------------- /tests/data/extractors/videos/test_embed.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test video 5 | 6 | 7 | 8 |
9 |

10 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 11 |

12 |

13 | 17 |

18 |

19 | Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. 20 | For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin. 21 |

22 |

23 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. 24 |

25 |

26 | 30 |

31 |

32 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. 33 | In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day. 34 |

35 |

36 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle. 37 |

38 |
39 | 40 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-es.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | que 4 | el 5 | en 6 | y 7 | a 8 | los 9 | del 10 | se 11 | las 12 | por 13 | un 14 | para 15 | con 16 | no 17 | una 18 | su 19 | al 20 | lo 21 | como 22 | más 23 | pero 24 | sus 25 | le 26 | ya 27 | o 28 | este 29 | sí 30 | porque 31 | esta 32 | entre 33 | cuando 34 | muy 35 | sin 36 | sobre 37 | también 38 | me 39 | hasta 40 | hay 41 | donde 42 | quien 43 | desde 44 | todo 45 | nos 46 | durante 47 | todos 48 | uno 49 | les 50 | ni 51 | contra 52 | otros 53 | ese 54 | eso 55 | ante 56 | ellos 57 | e 58 | esto 59 | mí 60 | antes 61 | algunos 62 | qué 63 | unos 64 | yo 65 | otro 66 | otras 67 | otra 68 | él 69 | tanto 70 | esa 71 | estos 72 | mucho 73 | quienes 74 | nada 75 | muchos 76 | cual 77 | poco 78 | ella 79 | estar 80 | estas 81 | algunas 82 | algo 83 | nosotros 84 | mi 85 | mis 86 | tú 87 | te 88 | ti 89 | tu 90 | tus 91 | ellas 92 | nosotras 93 | vosotros 94 | vosotras 95 | os 96 | mío 97 | mía 98 | míos 99 | mías 100 | tuyo 101 | tuya 102 | tuyos 103 | tuyas 104 | suyo 105 | suya 106 | suyos 107 | suyas 108 | nuestro 109 | nuestra 110 | nuestros 111 | nuestras 112 | vuestro 113 | vuestra 114 | vuestros 115 | vuestras 116 | esos 117 | esas 118 | estoy 119 | estás 120 | está 121 | estamos 122 | estáis 123 | están 124 | esté 125 | estés 126 | estemos 127 | estéis 128 | estén 129 | estaré 130 | estarás 131 | estará 132 | estaremos 133 | estaréis 134 | estarán 135 | estaría 136 | estarías 137 | estaríamos 138 | estaríais 139 | estarían 140 | estaba 141 | estabas 142 | estábamos 143 | estabais 144 | estaban 145 | estuve 146 | estuviste 147 | estuvo 148 | estuvimos 149 | estuvisteis 150 | estuvieron 151 | estuviera 152 | estuvieras 153 | estuviéramos 154 | estuvierais 155 | estuvieran 156 | estuviese 157 | estuvieses 158 | estuviésemos 
159 | estuvieseis 160 | estuviesen 161 | estando 162 | estado 163 | estada 164 | estados 165 | estadas 166 | estad 167 | he 168 | has 169 | ha 170 | hemos 171 | habéis 172 | han 173 | haya 174 | hayas 175 | hayamos 176 | hayáis 177 | hayan 178 | habré 179 | habrás 180 | habrá 181 | habremos 182 | habréis 183 | habrán 184 | habría 185 | habrías 186 | habríamos 187 | habríais 188 | habrían 189 | había 190 | habías 191 | habíamos 192 | habíais 193 | habían 194 | hube 195 | hubiste 196 | hubo 197 | hubimos 198 | hubisteis 199 | hubieron 200 | hubiera 201 | hubieras 202 | hubiéramos 203 | hubierais 204 | hubieran 205 | hubiese 206 | hubieses 207 | hubiésemos 208 | hubieseis 209 | hubiesen 210 | habiendo 211 | habido 212 | habida 213 | habidos 214 | habidas 215 | 216 | # forms of ser, to be (not including the infinitive): 217 | soy 218 | eres 219 | es 220 | somos 221 | sois 222 | son 223 | sea 224 | seas 225 | seamos 226 | seáis 227 | sean 228 | seré 229 | serás 230 | será 231 | seremos 232 | seréis 233 | serán 234 | sería 235 | serías 236 | seríamos 237 | seríais 238 | serían 239 | era 240 | eras 241 | éramos 242 | erais 243 | eran 244 | fui 245 | fuiste 246 | fue 247 | fuimos 248 | fuisteis 249 | fueron 250 | fuera 251 | fueras 252 | fuéramos 253 | fuerais 254 | fueran 255 | fuese 256 | fueses 257 | fuésemos 258 | fueseis 259 | fuesen 260 | siendo 261 | sido 262 | tengo 263 | tienes 264 | tiene 265 | tenemos 266 | tenéis 267 | tienen 268 | tenga 269 | tengas 270 | tengamos 271 | tengáis 272 | tengan 273 | tendré 274 | tendrás 275 | tendrá 276 | tendremos 277 | tendréis 278 | tendrán 279 | tendría 280 | tendrías 281 | tendríamos 282 | tendríais 283 | tendrían 284 | tenía 285 | tenías 286 | teníamos 287 | teníais 288 | tenían 289 | tuve 290 | tuviste 291 | tuvo 292 | tuvimos 293 | tuvisteis 294 | tuvieron 295 | tuviera 296 | tuvieras 297 | tuviéramos 298 | tuvierais 299 | tuvieran 300 | tuviese 301 | tuvieses 302 | tuviésemos 303 | tuvieseis 304 | tuviesen 305 | teniendo 
306 | tenido 307 | tenida 308 | tenidos 309 | tenidas 310 | tened 311 | -------------------------------------------------------------------------------- /goose/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License. 
class Goose(object):
    """\
    Entry point of the library.

    Holds a configuration object and turns a URL or a raw HTML string
    into an article by delegating to ``Crawler``.
    """
    def __init__(self, config=None):
        """\
        Build the extractor.

        ``config`` may be a configuration object, a dict of attribute
        overrides, or None (a default ``Configuration`` is then used).
        """
        self.config = config or Configuration()
        self.extend_config()
        self.initialize()

    def extend_config(self):
        """\
        Convert a dict configuration into a ``Configuration`` object.

        Only keys that already exist as attributes on a fresh
        ``Configuration`` are copied; unknown keys are ignored.
        Non-dict configurations are left untouched.
        """
        if isinstance(self.config, dict):
            config = Configuration()
            for k, v in self.config.items():
                if hasattr(config, k):
                    setattr(config, k, v)
            self.config = config

    def extract(self, url=None, raw_html=None):
        """\
        Main method to extract an article object from a URL,
        pass in a url (or raw html) and get back an Article
        """
        cc = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(cc)

    def shutdown_network(self):
        """Currently does nothing."""
        pass

    def crawl(self, crawl_candiate):
        """\
        Crawl the candidate, falling back to the other available parsers.

        The configured parser is tried first. If it raises
        ``UnicodeDecodeError`` or ``ValueError``, every other entry of
        ``config.available_parsers`` is tried once, in order. When a
        fallback succeeds, ``config.parser_class`` stays set to it.
        When every parser fails, ``config.parser_class`` is restored and
        the last error is re-raised (previously this case recursed
        without end, or raised IndexError with a single parser).
        """
        original_parser = self.config.parser_class

        # configured parser first, then each remaining parser exactly once
        candidates = [original_parser]
        for parser in self.config.available_parsers:
            if parser not in candidates:
                candidates.append(parser)

        last_index = len(candidates) - 1
        for index, parser in enumerate(candidates):
            self.config.parser_class = parser
            try:
                return Crawler(self.config).crawl(crawl_candiate)
            except (UnicodeDecodeError, ValueError):
                if index == last_index:
                    # nothing left to try: undo the switch and report
                    self.config.parser_class = original_parser
                    raise

    def initialize(self):
        """\
        Make sure the local storage directory exists and is writable.

        Does nothing when ``local_storage_path`` is unset or image
        fetching is disabled. Raises ``Exception`` when the directory
        cannot be created or a temporary file cannot be written to it.
        """
        # we don't need to go further if image extractor or
        # local_storage is not set
        if not self.config.local_storage_path or \
           not self.config.enable_image_fetching:
            return

        storage_path = self.config.local_storage_path

        # create the directory when missing; a failure here (or a
        # concurrent creation) is settled by the isdir check below
        if not os.path.isdir(storage_path):
            try:
                os.makedirs(storage_path)
            except OSError:
                pass

        if not os.path.isdir(storage_path):
            raise Exception(storage_path +
                " directory does not seem to exist, "
                "you need to set this for image processing downloads"
            )

        # write a dummy file to the directory to check that it is
        # writable; mkstemp itself is what fails on a read-only
        # directory, so it must live inside the try block
        try:
            handle, probe_path = mkstemp(dir=storage_path)
            os.close(handle)
            os.remove(probe_path)
        except (IOError, OSError):
            raise Exception(storage_path +
                " directory is not writeble, "
                "you need to set this for image processing downloads"
            )
TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]


class TitleExtractor(BaseExtractor):

    def clean_title(self, title):
        """Clean title with the use of og:site_name
        in this case try to get rid of site name
        and use TITLE_SPLITTERS to reformat title

        Returns an empty unicode string when nothing is left (or when
        ``title`` is empty/None).
        """
        # a missing attribute value reaches us as None/empty
        if not title:
            return u""

        # check if we have the site name in opengraph data
        if "site_name" in self.article.opengraph:
            site_name = self.article.opengraph['site_name']
            # remove the site name from title
            if site_name:
                title = title.replace(site_name, '').strip()

        # try to remove the domain from the title; the domain is literal
        # text, so escape it (an unescaped "." would match any character)
        if self.article.domain:
            pattern = re.compile(re.escape(self.article.domain),
                                 re.IGNORECASE)
            title = pattern.sub("", title).strip()

        # split the title in words
        # TechCrunch | my wonderfull article
        # my wonderfull article | TechCrunch
        title_words = title.split()

        # drop a leading splitter; the emptiness guard avoids an
        # IndexError on an empty title
        if title_words and title_words[0] in TITLE_SPLITTERS:
            title_words.pop(0)

        # drop a trailing splitter; the list may have just become empty
        # (title was a lone "|"), hence the second guard
        if title_words and title_words[-1] in TITLE_SPLITTERS:
            title_words.pop(-1)

        # rebuild the title
        return u" ".join(title_words).strip()

    def get_title(self):
        """\
        Fetch the article title and analyze it

        Sources are tried in order: opengraph ``title``, the
        ``<meta name="headline">`` content, then the ``<title>`` element.
        Returns '' when none is present.
        """
        title = ''

        # rely on opengraph in case we have the data
        if "title" in self.article.opengraph:
            title = self.article.opengraph['title']
            return self.clean_title(title)

        # try to fetch the meta headline
        meta_headline = self.parser.getElementsByTag(
            self.article.doc,
            tag="meta",
            attr="name",
            value="headline")
        if meta_headline is not None and len(meta_headline) > 0:
            title = self.parser.getAttribute(meta_headline[0], 'content')
            return self.clean_title(title)

        # otherwise use the title meta
        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
        if title_element is not None and len(title_element) > 0:
            title = self.parser.getText(title_element[0])
            return self.clean_title(title)

        return title

    def extract(self):
        """Extractor entry point: return the cleaned article title."""
        return self.get_title()
9 |

10 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 11 |

12 |

13 | 14 | 15 | 16 | 20 | 21 |

22 |

23 | Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. 24 | For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin. 25 |

26 |

27 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. 28 |

29 |

30 | 31 | 32 | 33 | 34 |

35 |

36 | 40 |

41 |

42 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. 43 | In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day. 44 |

45 |

46 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle. 47 |

48 |
49 | 50 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-hu.txt: -------------------------------------------------------------------------------- 1 | a 2 | á 3 | ahogy 4 | ahol 5 | aki 6 | akik 7 | akkor 8 | alatt 9 | által 10 | általában 11 | amely 12 | amelyek 13 | amelyekben 14 | amelyeket 15 | amelyet 16 | amelynek 17 | ami 18 | amit 19 | amolyan 20 | amp 21 | amíg 22 | amikor 23 | át 24 | abban 25 | ahhoz 26 | annak 27 | arra 28 | arról 29 | az 30 | azok 31 | azon 32 | azt 33 | azzal 34 | azért 35 | aztán 36 | azután 37 | azonban 38 | b 39 | bár 40 | be 41 | belül 42 | benne 43 | c 44 | cikk 45 | cikkek 46 | cikkeket 47 | csak 48 | d 49 | de 50 | e 51 | é 52 | eddig 53 | egész 54 | egy 55 | egyes 56 | egyetlen 57 | egyéb 58 | egyik 59 | egyre 60 | ekkor 61 | el 62 | elég 63 | ellen 64 | elő 65 | először 66 | előtt 67 | első 68 | én 69 | éppen 70 | ebben 71 | ehhez 72 | emilyen 73 | ennek 74 | erre 75 | ez 76 | ezt 77 | ezek 78 | ezen 79 | ezzel 80 | ezért 81 | és 82 | f 83 | fel 84 | felé 85 | g 86 | h 87 | hanem 88 | hiszen 89 | hogy 90 | hogyan 91 | i 92 | í 93 | igen 94 | így 95 | illetve 96 | ill. 
97 | ill 98 | ilyen 99 | ilyenkor 100 | is 101 | ison 102 | ismét 103 | itt 104 | j 105 | jó 106 | jól 107 | jobban 108 | k 109 | kell 110 | kellett 111 | keresztül 112 | keressünk 113 | ki 114 | kívül 115 | között 116 | közül 117 | l 118 | legalább 119 | lehet 120 | lehetett 121 | legyen 122 | lenne 123 | lenni 124 | lesz 125 | lett 126 | m 127 | maga 128 | magát 129 | majd 130 | majd 131 | már 132 | más 133 | másik 134 | meg 135 | még 136 | mellett 137 | mert 138 | mely 139 | melyek 140 | mi 141 | mit 142 | míg 143 | miért 144 | milyen 145 | mikor 146 | minden 147 | mindent 148 | mindenki 149 | mindig 150 | mint 151 | mintha 152 | mivel 153 | most 154 | n 155 | nagy 156 | nagyobb 157 | nagyon 158 | ne 159 | néha 160 | nekem 161 | neki 162 | nem 163 | néhány 164 | nélkül 165 | nincs 166 | o 167 | ó 168 | olyan 169 | ott 170 | össze 171 | ö 172 | ő 173 | ők 174 | őket 175 | p 176 | pedig 177 | persze 178 | q 179 | r 180 | rá 181 | s 182 | saját 183 | sem 184 | semmi 185 | sok 186 | sokat 187 | sokkal 188 | sz 189 | számára 190 | szemben 191 | szerint 192 | szinte 193 | t 194 | talán 195 | tehát 196 | teljes 197 | tovább 198 | továbbá 199 | több 200 | u 201 | ú 202 | úgy 203 | ugyanis 204 | új 205 | újabb 206 | újra 207 | után 208 | utána 209 | utolsó 210 | ü 211 | ű 212 | v 213 | vagy 214 | vagyis 215 | valaki 216 | valamely 217 | valami 218 | valamint 219 | való 220 | vagyok 221 | van 222 | vannak 223 | volt 224 | voltam 225 | voltak 226 | voltunk 227 | vissza 228 | vele 229 | viszont 230 | volna 231 | számolnak 232 | szólnak 233 | szól 234 | w 235 | x 236 | y 237 | z 238 | zs 239 | a 240 | ahogy 241 | ahol 242 | aki 243 | akkor 244 | alatt 245 | általában 246 | által 247 | amely 248 | amíg 249 | amikor 250 | ami 251 | amolyan 252 | arra 253 | át 254 | az 255 | azért 256 | azonban 257 | azon 258 | aztán 259 | azt 260 | azután 261 | azzal 262 | bár 263 | be 264 | belül 265 | benne 266 | cikk 267 | csak 268 | de 269 | eddig 270 | egész 271 | egy 272 | egyéb 273 | 
egyes 274 | egyetlen 275 | egyik 276 | egyre 277 | ekkor 278 | el 279 | elég 280 | ellen 281 | elő 282 | először 283 | előtt 284 | első 285 | emilyen 286 | én 287 | éppen 288 | erre 289 | és 290 | e 291 | ez 292 | ezen 293 | ezért 294 | ezzel 295 | fel 296 | felé 297 | hanem 298 | hiszen 299 | hogy 300 | hogyan 301 | igen 302 | így 303 | ill. 304 | illetve 305 | ill 306 | ilyen 307 | ilyenkor 308 | ismét 309 | ison 310 | itt 311 | jó 312 | jobban 313 | jól 314 | kell 315 | keres 316 | keresztül 317 | ki 318 | kívül 319 | között 320 | közül 321 | legalább 322 | legyen 323 | lehet 324 | lenni 325 | lett 326 | maga 327 | maga 328 | majd 329 | már 330 | más 331 | másik 332 | még 333 | meg 334 | mellett 335 | mely 336 | mert 337 | miért 338 | míg 339 | mikor 340 | milyen 341 | minden 342 | mindenki 343 | mindig 344 | mi 345 | mint 346 | mintha 347 | mivel 348 | most 349 | nagy 350 | nagyobb 351 | nagyon 352 | ne 353 | néha 354 | néhány 355 | neki 356 | nélkül 357 | nem 358 | nincs 359 | ők 360 | olyan 361 | ő 362 | össze 363 | ott 364 | pedig 365 | persze 366 | rá 367 | saját 368 | s 369 | sem 370 | semmi 371 | sokkal 372 | sok 373 | számára 374 | számol 375 | szemben 376 | szerint 377 | szinte 378 | szól 379 | talán 380 | tehát 381 | teljes 382 | továbbá 383 | tovább 384 | úgy 385 | ugyanis 386 | új 387 | újabb 388 | újra 389 | utána 390 | után 391 | utolsó 392 | vagy 393 | vagyis 394 | valaki 395 | valamely 396 | valami 397 | valamint 398 | való 399 | van 400 | vissza 401 | viszont 402 | volt 403 | 404 | -------------------------------------------------------------------------------- /goose/configuration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 
HTTP_DEFAULT_TIMEOUT = 30

# parser name -> parser class
AVAILABLE_PARSERS = {
    'lxml': Parser,
    'soup': ParserSoup,
}


class Configuration(object):
    """Holds every tunable option read by the extraction code."""

    def __init__(self):
        # Images smaller than this many bytes are rejected; this filters
        # out small pictures such as author avatars at the top of an
        # article.
        self.images_min_bytes = 4500

        # Turn off when images are of no interest.
        self.enable_image_fetching = True

        # When True, the language declared in the page meta data picks
        # the stopwords dictionary; set to False to force
        # target_language instead.
        self.use_meta_language = True

        # Fallback language, also the forced one when use_meta_language
        # is False.
        self.target_language = 'en'

        # Class providing the stopwords.
        self.stopwords_class = StopWords

        # Locations of the imagemagick executables (MacPorts defaults).
        self.imagemagick_convert_path = "/opt/local/bin/convert"
        self.imagemagick_identify_path = "/opt/local/bin/identify"

        # User agent sent with the web requests made to fetch an article.
        self.browser_user_agent = 'Goose/%s' % __version__

        # Debug mode: additional debugging information goes to stdout.
        self.debug = False

        # TODO
        self.extract_publishdate = None

        # TODO
        self.additional_data_extractor = None

        # Names of the selectable parsers and the one used by default.
        self.available_parsers = AVAILABLE_PARSERS.keys()
        self.parser_class = 'lxml'

        # Directory used for local storage, under the system temp dir.
        temp_root = tempfile.gettempdir()
        self.local_storage_path = os.path.join(temp_root, 'goose')

        # Timeout applied to http requests.
        self.http_timeout = HTTP_DEFAULT_TIMEOUT

    def get_parser(self):
        """Return the parser class registered under ``parser_class``."""
        parser_name = self.parser_class
        return AVAILABLE_PARSERS[parser_name]

    def get_additionaldata_extractor(self):
        """Return the additional data extractor (``None`` if unset)."""
        return self.additional_data_extractor

    def set_additionaldata_extractor(self, extractor):
        """\
        Register the extractor used for additional data.
        @param extractor a concrete instance of AdditionalDataExtractor
        Raises ValueError when ``extractor`` is falsy.
        """
        if extractor:
            self.additional_data_extractor = extractor
            return
        raise ValueError("extractor must not be null!")
| ebbe 163 | avemmo 164 | aveste 165 | ebbero 166 | avessi 167 | avesse 168 | avessimo 169 | avessero 170 | avendo 171 | avuto 172 | avuta 173 | avuti 174 | avute 175 | sono 176 | sei 177 | è 178 | é 179 | e 180 | siamo 181 | siete 182 | sia 183 | siate 184 | siano 185 | sarà 186 | sarai 187 | sarò 188 | saro 189 | saremo 190 | sarete 191 | saranno 192 | sarei 193 | saresti 194 | sarebbe 195 | saremmo 196 | sareste 197 | sarebbero 198 | ero 199 | eri 200 | era 201 | eravamo 202 | eravate 203 | erano 204 | fui 205 | fosti 206 | fu 207 | fummo 208 | foste 209 | furono 210 | fossi 211 | fosse 212 | fossimo 213 | fossero 214 | essendo 215 | faccio 216 | fai 217 | facciamo 218 | fanno 219 | faccia 220 | facciate 221 | facciano 222 | farà 223 | farai 224 | farò 225 | faremo 226 | farete 227 | faranno 228 | farei 229 | faresti 230 | farebbe 231 | faremmo 232 | fareste 233 | farebbero 234 | facevo 235 | facevi 236 | faceva 237 | facevamo 238 | facevate 239 | facevano 240 | feci 241 | facesti 242 | fece 243 | facemmo 244 | faceste 245 | fecero 246 | facessi 247 | facesse 248 | facessimo 249 | facessero 250 | facendo 251 | sto 252 | stai 253 | sta 254 | stiamo 255 | stanno 256 | stia 257 | stiate 258 | stiano 259 | starà 260 | starai 261 | starò 262 | staremo 263 | starete 264 | staranno 265 | starei 266 | staresti 267 | starebbe 268 | staremmo 269 | stareste 270 | starebbero 271 | stavo 272 | stavi 273 | stava 274 | stavamo 275 | stavate 276 | stavano 277 | stetti 278 | stesti 279 | stette 280 | stemmo 281 | steste 282 | stettero 283 | stessi 284 | stesse 285 | stessimo 286 | stessero 287 | stando 288 | -------------------------------------------------------------------------------- /goose/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. 
try:
    _TEXT_TYPE = unicode  # Python 2: text that must be encoded to hash
except NameError:  # no ``unicode`` builtin: ``str`` is the text type
    _TEXT_TYPE = str


class BuildURL(object):
    """Resolves the scheme/hostname of ``url``, using ``finalurl`` as a
    fallback for the parts ``url`` itself does not carry."""

    def __init__(self, url, finalurl=None):
        self.url = url
        self.finalurl = finalurl

    def getHostname(self, o):
        """Return the hostname of the parsed url ``o``, else the one of
        ``finalurl``, else None."""
        if o.hostname:
            return o.hostname
        elif self.finalurl:
            oo = urlparse.urlparse(self.finalurl)
            if oo.hostname:
                return oo.hostname
        return None

    def getScheme(self, o):
        """Return the scheme of the parsed url ``o``, else the one of
        ``finalurl``, else 'http'."""
        if o.scheme:
            return o.scheme
        elif self.finalurl:
            oo = urlparse.urlparse(self.finalurl)
            if oo.scheme:
                return oo.scheme
        return 'http'

    def getUrl(self):
        """\
        Resolve the scheme and hostname of ``self.url``.

        NOTE(review): nothing is built or returned from the resolved
        parts, so this still returns None as it always did; the method
        looks unfinished.
        """
        url_obj = urlparse.urlparse(self.url)
        scheme = self.getScheme(url_obj)
        hostname = self.getHostname(url_obj)


class FileHelper(object):

    @classmethod
    def loadResourceFile(cls, filename):
        """\
        Return the utf-8 decoded content of a resource file.

        A relative ``filename`` is looked up under the ``resources``
        directory of the goose package; an absolute one is used as is.
        Raises IOError when the file cannot be read.
        """
        if not os.path.isabs(filename):
            dirpath = os.path.dirname(goose.__file__)
            path = os.path.join(dirpath, 'resources', filename)
        else:
            path = filename
        try:
            # the with block closes the file even when read() fails
            with codecs.open(path, 'r', 'utf-8') as f:
                return f.read()
        except IOError:
            raise IOError("Couldn't open file %s" % path)


class ParsingCandidate(object):
    """A url together with the hash identifying this crawl of it."""

    def __init__(self, urlString, link_hash):
        self.urlString = self.url = urlString
        self.link_hash = link_hash


class RawHelper(object):
    @classmethod
    def get_parsing_candidate(cls, url, raw_html):
        """Build a candidate whose hash is ``<md5 of raw_html>.<time>``."""
        if isinstance(raw_html, _TEXT_TYPE):
            raw_html = raw_html.encode('utf-8')
        link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
        return ParsingCandidate(url, link_hash)


class URLHelper(object):
    @classmethod
    def get_parsing_candidate(cls, url_to_crawl):
        """Build a candidate whose hash is ``<md5 of url>.<time>``."""
        # replace shebang in urls
        final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
                    if '#!' in url_to_crawl else url_to_crawl
        # md5 needs bytes: encode text urls the same way RawHelper
        # encodes html, so non-ascii urls do not fail to hash
        hashable = final_url
        if isinstance(hashable, _TEXT_TYPE):
            hashable = hashable.encode('utf-8')
        link_hash = '%s.%s' % (hashlib.md5(hashable).hexdigest(), time.time())
        return ParsingCandidate(final_url, link_hash)


class StringReplacement(object):
    """A single literal ``pattern`` -> ``replaceWith`` substitution."""

    def __init__(self, pattern, replaceWith):
        self.pattern = pattern
        self.replaceWith = replaceWith

    def replaceAll(self, string):
        """Return ``string`` with every occurrence replaced; u'' for an
        empty/None input."""
        if not string:
            return u''
        return string.replace(self.pattern, self.replaceWith)


class ReplaceSequence(object):
    """An ordered chain of StringReplacement applied one after another."""

    def __init__(self):
        self.replacements = []

    def create(self, firstPattern, replaceWith=None):
        """Add a replacement (default: delete the pattern) and return
        self so calls can be chained."""
        result = StringReplacement(firstPattern, replaceWith or u'')
        self.replacements.append(result)
        return self

    def append(self, pattern, replaceWith=None):
        """Alias of ``create``."""
        return self.create(pattern, replaceWith)

    def replaceAll(self, string):
        """Apply every replacement in insertion order; u'' for an
        empty/None input."""
        if not string:
            return u''

        mutatedString = string

        for rp in self.replacements:
            mutatedString = rp.replaceAll(mutatedString)
        return mutatedString
-------------------------------------------------------------------------------- /goose/utils/images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License. 
class ImageUtils(object):
    """Helpers to download images and cache them on local storage."""

    @classmethod
    def get_image_dimensions(cls, identify_program, path):
        """\
        Return an ImageDetails holding the format and size of the image
        stored at ``path``; the mime type is 'NA' when the file cannot
        be opened as an image. ``identify_program`` is not used by this
        implementation.
        """
        image_details = ImageDetails()
        try:
            image = Image.open(path)
            image_details.set_mime_type(image.format)
            width, height = image.size
            image_details.set_width(width)
            image_details.set_height(height)
        except IOError:
            image_details.set_mime_type('NA')
        return image_details

    @classmethod
    def store_image(cls, http_client, link_hash, src, config):
        """\
        Writes an image src http string to disk as a temporary file
        and returns the LocallyStoredImage object
        that has the info you should need on the image

        Returns None when the image could not be downloaded.
        """
        # check for a cache hit already on disk
        image = cls.read_localfile(link_hash, src, config)
        if image:
            return image

        # no cache found download the image
        data = cls.fetch(http_client, src)
        if data:
            image = cls.write_localfile(data, link_hash, src, config)
            if image:
                return image

        return None

    @classmethod
    def get_mime_type(cls, image_details):
        """Map the detected image format to a file extension ('.png',
        '.jpg', '.gif'), or 'NA' for anything else."""
        mime_type = image_details.get_mime_type().lower()
        mimes = {
            'png': '.png',
            'jpg': '.jpg',
            'jpeg': '.jpg',
            'gif': '.gif',
        }
        return mimes.get(mime_type, 'NA')

    @classmethod
    def read_localfile(cls, link_hash, src, config):
        """Return the LocallyStoredImage for an image already on disk,
        or None when there is no such local file."""
        local_image_name = cls.get_localfile_name(link_hash, src, config)
        if os.path.isfile(local_image_name):
            identify = config.imagemagick_identify_path
            image_details = cls.get_image_dimensions(identify, local_image_name)
            file_extension = cls.get_mime_type(image_details)
            size = os.path.getsize(local_image_name)
            return LocallyStoredImage(
                src=src,
                local_filename=local_image_name,
                link_hash=link_hash,
                bytes=size,
                file_extension=file_extension,
                height=image_details.get_height(),
                width=image_details.get_width()
            )
        return None

    @classmethod
    def write_localfile(cls, entity, link_hash, src, config):
        """Write the raw image bytes ``entity`` to local storage and
        return the matching LocallyStoredImage."""
        local_path = cls.get_localfile_name(link_hash, src, config)
        # the with block closes the file even when the write fails
        with open(local_path, 'wb') as f:
            f.write(entity)
        return cls.read_localfile(link_hash, src, config)

    @classmethod
    def get_localfile_name(cls, link_hash, src, config):
        """Return ``<local_storage_path>/<link_hash>_<md5 of src>``."""
        image_hash = hashlib.md5(smart_str(src)).hexdigest()
        return os.path.join(config.local_storage_path, '%s_%s' % (link_hash, image_hash))

    @classmethod
    def clean_src_string(cls, src):
        """Percent-encode the spaces of an image url."""
        return src.replace(" ", "%20")

    @classmethod
    def fetch(cls, http_client, src):
        """\
        Download ``src`` and return its raw content, or None on any
        failure (deliberately best-effort: a broken image must not abort
        the extraction). ``http_client`` is not used by this
        implementation.
        """
        try:
            req = urllib2.Request(src)
            f = urllib2.urlopen(req)
            try:
                return f.read()
            finally:
                # release the connection even when read() fails
                f.close()
        except Exception:
            return None
RE_LANG = r'^[A-Za-z]{2}$'


class MetasExtractor(BaseExtractor):

    def get_domain(self):
        """Return the hostname of the article final url, or None."""
        if self.article.final_url:
            o = urlparse(self.article.final_url)
            return o.hostname
        return None

    def get_favicon(self):
        """\
        Extract the favicon from a website
        http://en.wikipedia.org/wiki/Favicon

        Returns the ``href`` of the first ``<link rel="icon">`` found,
        or '' when there is none.
        """
        kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
        meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
        if meta:
            favicon = self.parser.getAttribute(meta[0], 'href')
            return favicon
        return ''

    def get_canonical_link(self):
        """\
        Return the canonical link declared by the document, made
        absolute, or the article final url when none is declared.
        """
        if self.article.final_url:
            kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'}
            meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
            if meta is not None and len(meta) > 0:
                href = self.parser.getAttribute(meta[0], 'href')
                if href:
                    href = href.strip()
                    o = urlparse(href)
                    if not o.hostname:
                        # resolve against the full final url: rebuilding
                        # the base from scheme + hostname dropped the
                        # port and resolved path-relative hrefs against
                        # the site root
                        href = urljoin(self.article.final_url, href)
                    return href
        return self.article.final_url

    def get_meta_lang(self):
        """\
        Extract content language from meta

        Returns the lower-cased two-letter code, or None when no usable
        language is declared.
        """
        # we have a lang attribute in html
        attr = self.parser.getAttribute(self.article.doc, attr='lang')
        if not attr:
            # missing or empty: look up for a Content-Language in meta
            items = [
                {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
                {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
            ]
            for item in items:
                meta = self.parser.getElementsByTag(self.article.doc, **item)
                if meta:
                    attr = self.parser.getAttribute(meta[0], attr='content')
                    break

        if attr:
            # keep the language part only, e.g. "en" out of "en-US"
            value = attr[:2]
            if re.search(RE_LANG, value):
                return value.lower()

        return None

    def get_meta_content(self, metaName):
        """\
        Extract a given meta content from the document

        ``metaName`` is a css selector; returns the stripped ``content``
        attribute of the first match, or '' when absent.
        """
        meta = self.parser.css_select(self.article.doc, metaName)
        content = None

        if meta is not None and len(meta) > 0:
            content = self.parser.getAttribute(meta[0], 'content')

        if content:
            return content.strip()

        return ''

    def get_meta_description(self):
        """\
        if the article has meta description set in the source, use that
        """
        return self.get_meta_content("meta[name=description]")

    def get_meta_keywords(self):
        """\
        if the article has meta keywords set in the source, use that
        """
        return self.get_meta_content("meta[name=keywords]")

    def extract(self):
        """Extractor entry point: return every meta value in one dict."""
        return {
            "description": self.get_meta_description(),
            "keywords": self.get_meta_keywords(),
            "lang": self.get_meta_lang(),
            "favicon": self.get_favicon(),
            "canonical": self.get_canonical_link(),
            "domain": self.get_domain()
        }
мной 20 | много 21 | многочисленное 22 | многочисленная 23 | многочисленные 24 | многочисленный 25 | мною 26 | мой 27 | мог 28 | могут 29 | можно 30 | может 31 | можхо 32 | мор 33 | моя 34 | моё 35 | мочь 36 | над 37 | нее 38 | оба 39 | нам 40 | нем 41 | нами 42 | ними 43 | мимо 44 | немного 45 | одной 46 | одного 47 | менее 48 | однажды 49 | однако 50 | меня 51 | нему 52 | меньше 53 | ней 54 | наверху 55 | него 56 | ниже 57 | мало 58 | надо 59 | один 60 | одиннадцать 61 | одиннадцатый 62 | назад 63 | наиболее 64 | недавно 65 | миллионов 66 | недалеко 67 | между 68 | низко 69 | меля 70 | нельзя 71 | нибудь 72 | непрерывно 73 | наконец 74 | никогда 75 | никуда 76 | нас 77 | наш 78 | нет 79 | нею 80 | неё 81 | них 82 | мира 83 | наша 84 | наше 85 | наши 86 | ничего 87 | начала 88 | нередко 89 | несколько 90 | обычно 91 | опять 92 | около 93 | мы 94 | ну 95 | нх 96 | от 97 | отовсюду 98 | особенно 99 | нужно 100 | очень 101 | отсюда 102 | в 103 | во 104 | вон 105 | вниз 106 | внизу 107 | вокруг 108 | вот 109 | восемнадцать 110 | восемнадцатый 111 | восемь 112 | восьмой 113 | вверх 114 | вам 115 | вами 116 | важное 117 | важная 118 | важные 119 | важный 120 | вдали 121 | везде 122 | ведь 123 | вас 124 | ваш 125 | ваша 126 | ваше 127 | ваши 128 | впрочем 129 | весь 130 | вдруг 131 | вы 132 | все 133 | второй 134 | всем 135 | всеми 136 | времени 137 | время 138 | всему 139 | всего 140 | всегда 141 | всех 142 | всею 143 | всю 144 | вся 145 | всё 146 | всюду 147 | г 148 | год 149 | говорил 150 | говорит 151 | года 152 | году 153 | где 154 | да 155 | ее 156 | за 157 | из 158 | ли 159 | же 160 | им 161 | до 162 | по 163 | ими 164 | под 165 | иногда 166 | довольно 167 | именно 168 | долго 169 | позже 170 | более 171 | должно 172 | пожалуйста 173 | значит 174 | иметь 175 | больше 176 | пока 177 | ему 178 | имя 179 | пор 180 | пора 181 | потом 182 | потому 183 | после 184 | почему 185 | почти 186 | посреди 187 | ей 188 | два 189 | две 190 | двенадцать 191 | двенадцатый 192 | 
двадцать 193 | двадцатый 194 | двух 195 | его 196 | дел 197 | или 198 | без 199 | день 200 | занят 201 | занята 202 | занято 203 | заняты 204 | действительно 205 | давно 206 | девятнадцать 207 | девятнадцатый 208 | девять 209 | девятый 210 | даже 211 | алло 212 | жизнь 213 | далеко 214 | близко 215 | здесь 216 | дальше 217 | для 218 | лет 219 | зато 220 | даром 221 | первый 222 | перед 223 | затем 224 | зачем 225 | лишь 226 | десять 227 | десятый 228 | ею 229 | её 230 | их 231 | бы 232 | еще 233 | при 234 | был 235 | про 236 | процентов 237 | против 238 | просто 239 | бывает 240 | бывь 241 | если 242 | люди 243 | была 244 | были 245 | было 246 | будем 247 | будет 248 | будете 249 | будешь 250 | прекрасно 251 | буду 252 | будь 253 | будто 254 | будут 255 | ещё 256 | пятнадцать 257 | пятнадцатый 258 | друго 259 | другое 260 | другой 261 | другие 262 | другая 263 | других 264 | есть 265 | пять 266 | быть 267 | лучше 268 | пятый 269 | к 270 | ком 271 | конечно 272 | кому 273 | кого 274 | когда 275 | которой 276 | которого 277 | которая 278 | которые 279 | который 280 | которых 281 | кем 282 | каждое 283 | каждая 284 | каждые 285 | каждый 286 | кажется 287 | как 288 | какой 289 | какая 290 | кто 291 | кроме 292 | куда 293 | кругом 294 | с 295 | т 296 | у 297 | я 298 | та 299 | те 300 | уж 301 | со 302 | то 303 | том 304 | снова 305 | тому 306 | совсем 307 | того 308 | тогда 309 | тоже 310 | собой 311 | тобой 312 | собою 313 | тобою 314 | сначала 315 | только 316 | уметь 317 | тот 318 | тою 319 | хорошо 320 | хотеть 321 | хочешь 322 | хоть 323 | хотя 324 | свое 325 | свои 326 | твой 327 | своей 328 | своего 329 | своих 330 | свою 331 | твоя 332 | твоё 333 | раз 334 | уже 335 | сам 336 | там 337 | тем 338 | чем 339 | сама 340 | сами 341 | теми 342 | само 343 | рано 344 | самом 345 | самому 346 | самой 347 | самого 348 | семнадцать 349 | семнадцатый 350 | самим 351 | самими 352 | самих 353 | саму 354 | семь 355 | чему 356 | раньше 357 | сейчас 358 | чего 359 | сегодня 360 
| себе 361 | тебе 362 | сеаой 363 | человек 364 | разве 365 | теперь 366 | себя 367 | тебя 368 | седьмой 369 | спасибо 370 | слишком 371 | так 372 | такое 373 | такой 374 | такие 375 | также 376 | такая 377 | сих 378 | тех 379 | чаще 380 | четвертый 381 | через 382 | часто 383 | шестой 384 | шестнадцать 385 | шестнадцатый 386 | шесть 387 | четыре 388 | четырнадцать 389 | четырнадцатый 390 | сколько 391 | сказал 392 | сказала 393 | сказать 394 | ту 395 | ты 396 | три 397 | эта 398 | эти 399 | что 400 | это 401 | чтоб 402 | этом 403 | этому 404 | этой 405 | этого 406 | чтобы 407 | этот 408 | стал 409 | туда 410 | этим 411 | этими 412 | рядом 413 | тринадцать 414 | тринадцатый 415 | этих 416 | третий 417 | тут 418 | эту 419 | суть 420 | чуть 421 | тысяч 422 | -------------------------------------------------------------------------------- /goose/extractors/videos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" originally licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License.
from goose.extractors import BaseExtractor
from goose.video import Video

VIDEOS_TAGS = ['iframe', 'embed', 'object', 'video']
VIDEO_PROVIDERS = ['youtube', 'vimeo', 'dailymotion', 'kewego']


class VideoExtractor(BaseExtractor):
    """\
    Extracts a list of videos from the article top node.

    Candidate nodes are every element under ``article.top_node`` whose
    tag is listed in VIDEOS_TAGS. Each candidate is dispatched to the
    ``get_<tag>_tag`` method and kept only when the resulting video has
    a provider listed in VIDEO_PROVIDERS.
    """
    def __init__(self, config, article):
        super(VideoExtractor, self).__init__(config, article)

        # candidate nodes still waiting to be examined by get_videos()
        self.candidates = []

        # videos accepted by the last get_videos() run
        self.movies = []

    def get_embed_code(self, node):
        """Return the node markup on a single line, each line stripped."""
        markup = self.parser.nodeToString(node)
        return "".join([line.strip() for line in markup.splitlines()])

    def get_embed_type(self, node):
        """Return the tag name of the node as reported by the parser."""
        return self.parser.getTag(node)

    def get_width(self, node):
        """Return the raw ``width`` attribute of the node."""
        return self.parser.getAttribute(node, 'width')

    def get_height(self, node):
        """Return the raw ``height`` attribute of the node."""
        return self.parser.getAttribute(node, 'height')

    def get_src(self, node):
        """Return the raw ``src`` attribute of the node."""
        return self.parser.getAttribute(node, 'src')

    def get_provider(self, src):
        """\
        Return the first entry of VIDEO_PROVIDERS contained in ``src``,
        or None when ``src`` is empty or no provider matches.
        """
        if src:
            for provider in VIDEO_PROVIDERS:
                if provider in src:
                    return provider
        return None

    def get_video(self, node):
        """
        Create a video object from a video embed
        """
        video = Video()
        video.embed_code = self.get_embed_code(node)
        video.embed_type = self.get_embed_type(node)
        video.width = self.get_width(node)
        video.height = self.get_height(node)
        video.src = self.get_src(node)
        video.provider = self.get_provider(video.src)
        return video

    def get_iframe_tag(self, node):
        """Build a video from an iframe node."""
        return self.get_video(node)

    def get_video_tag(self, node):
        """extract html video tags"""
        # Not implemented yet: an empty Video is returned. get_videos()
        # drops any video whose provider is None, so (presumably, if a
        # fresh Video has no provider - confirm in goose.video) html5
        # video tags currently contribute nothing.
        return Video()

    def get_embed_tag(self, node):
        """\
        Build a video from an embed node.

        When the embed is wrapped in an object node, the object node is
        the one that gets parsed (it carries the ``movie`` param).
        Returns None when that object cannot be resolved to a provider.
        """
        # embed node may have an object node as parent
        # in this case we want to retrieve the object node
        # instead of the embed
        parent = self.parser.getParent(node)
        if parent is not None:
            parent_tag = self.parser.getTag(parent)
            if parent_tag == 'object':
                # pass the parent: the embed itself has no param child,
                # so parsing it as an object could never succeed
                return self.get_object_tag(parent)
        return self.get_video(node)

    def get_object_tag(self, node):
        """\
        Build a video from an object node.

        The source is read from the ``<param name="movie">`` child.
        Returns None when that param is missing or when its value does
        not match any known provider.
        """
        # test if object tag has an embed child
        # in this case we want to remove the embed from
        # the candidate list to avoid parsing it twice
        child_embed_tag = self.parser.getElementsByTag(node, 'embed')
        if child_embed_tag and child_embed_tag[0] in self.candidates:
            self.candidates.remove(child_embed_tag[0])

        # get the object source
        # if we don't have a src node don't continue
        src_node = self.parser.getElementsByTag(node, tag="param", attr="name", value="movie")
        if not src_node:
            return None

        src = self.parser.getAttribute(src_node[0], "value")

        # check provider
        provider = self.get_provider(src)
        if not provider:
            return None

        # the object node has no src attribute of its own, so the values
        # computed by get_video() are overridden with the param ones
        video = self.get_video(node)
        video.provider = provider
        video.src = src
        return video

    def get_videos(self):
        """\
        Collect the videos found under ``article.top_node`` and store
        them as a list on ``self.article.movies``.
        """
        # candidates node
        self.candidates = self.parser.getElementsByTags(self.article.top_node, VIDEOS_TAGS)

        # start from a clean list so that calling this method again
        # does not duplicate previously found videos
        self.movies = []

        # loop all candidates
        # and check if src attribute belongs to a video provider
        # get_object_tag() removes entries from self.candidates, so we
        # walk a snapshot and skip whatever has been removed meanwhile
        for candidate in list(self.candidates):
            if candidate not in self.candidates:
                continue
            tag = self.parser.getTag(candidate)
            handler = getattr(self, "get_%s_tag" % tag, None)
            if handler is None:
                continue
            movie = handler(candidate)
            if movie is not None and movie.provider is not None:
                self.movies.append(movie)

        # append movies list to article
        self.article.movies = list(self.movies)